import sys
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from cuml.linear_model import LogisticRegression as cuMLLogisticRegression
print('NOVA_HOME is at', os.getenv('NOVA_HOME'))
sys.path.insert(1, os.getenv('NOVA_HOME'))
%load_ext autoreload
%autoreload 2
from utils import *
NOVA_HOME is at /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA_GAL/NOVA
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.ensemble import ExtraTreesClassifier
dataset_config = {
"path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen",
"multiplexed": False,
"config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
"config_dir": "manuscript/manuscript_figures_data_config",
}
## Baseline
run_baseline_model(
dataset_config= dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:19:40 INFO: [load_embeddings] multiplex=False 2025-08-20 16:19:40 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:19:40 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:19:40 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:19:42 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:19:43 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:19:44 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:19:44 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:19:44 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:19:44 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:19:44 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:19:44 INFO: [load_embeddings] multiplex=False 2025-08-20 16:19:44 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:19:44 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:19:44 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:19:46 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:19:47 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:19:47 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:19:48 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:19:48 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:19:48 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:19:48 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:19:48 INFO: [load_embeddings] multiplex=False 2025-08-20 16:19:48 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:19:48 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:19:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:19:50 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:19:50 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:19:51 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:19:51 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:19:51 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:19:51 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:19:51 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.90 0.94 0.92 2123
1 0.99 0.95 0.97 2536
2 0.94 0.93 0.94 2079
3 1.00 1.00 1.00 24823
4 0.89 0.82 0.86 2319
5 0.86 0.89 0.88 2608
6 0.98 0.92 0.95 2236
7 0.98 0.98 0.98 2265
8 0.98 0.97 0.98 2110
9 0.93 0.86 0.90 2104
10 0.93 0.93 0.93 2243
11 0.95 0.98 0.97 2236
12 0.98 0.98 0.98 2227
13 0.95 0.93 0.94 2360
14 0.96 0.94 0.95 1916
15 0.95 0.95 0.95 2074
16 0.92 0.86 0.89 1818
17 0.86 0.88 0.87 1631
18 0.94 0.97 0.95 2090
19 0.98 0.88 0.93 2019
20 0.88 0.96 0.91 1923
21 0.78 0.84 0.81 1654
22 0.89 0.93 0.91 1934
23 0.93 0.94 0.94 2086
24 0.96 0.97 0.96 2114
25 0.99 0.99 0.99 18531
accuracy 0.96 94059
macro avg 0.93 0.93 0.93 94059
weighted avg 0.96 0.96 0.96 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.86 0.89 0.87 1850
1 0.97 0.59 0.73 2044
2 0.95 0.95 0.95 2332
3 0.99 1.00 1.00 22599
4 0.72 0.74 0.73 1901
5 0.65 0.56 0.60 1492
6 0.98 0.85 0.91 2095
7 0.96 0.95 0.95 2384
8 0.98 0.97 0.97 2145
9 0.96 0.82 0.89 2358
10 0.96 0.95 0.96 2340
11 0.97 0.96 0.96 2095
12 0.93 0.99 0.96 2085
13 0.85 0.99 0.92 2117
14 0.94 0.92 0.93 1751
15 0.97 0.88 0.92 1855
16 0.86 0.52 0.65 1623
17 0.55 0.71 0.62 1903
18 0.83 0.99 0.90 2085
19 0.98 0.90 0.94 2152
20 0.69 0.98 0.81 1857
21 0.61 0.28 0.38 1484
22 0.92 0.92 0.92 1836
23 0.82 0.93 0.87 2078
24 0.97 0.97 0.97 2200
25 0.94 0.98 0.96 16469
accuracy 0.92 87130
macro avg 0.88 0.85 0.86 87130
weighted avg 0.92 0.92 0.91 87130
=== Overall Accuracy ===
0.9389480095328129 [0.9595785623916904, 0.9183174566739355]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.995381 0.914926 0.997184 0.879294 0.998091
CLTC_WT_Untreated 0.994310 0.791048 0.999581 0.979984 0.994608
Calreticulin_WT_Untreated 0.997268 0.943550 0.998608 0.944192 0.998591
DAPI_WT_Untreated 0.998565 1.000000 0.998056 0.994547 1.000000
DCP1A_WT_Untreated 0.990717 0.785782 0.995604 0.809966 0.994895
FMRP_WT_Untreated 0.990320 0.771463 0.995387 0.794724 0.994712
FUS_WT_Untreated 0.996865 0.887786 0.999536 0.979119 0.997258
G3BP1_WT_Untreated 0.998195 0.962357 0.999139 0.967142 0.999009
GM130_WT_Untreated 0.998830 0.968743 0.999554 0.981195 0.999249
KIF5A_WT_Untreated 0.994928 0.839534 0.998851 0.948595 0.995960
LAMP1_WT_Untreated 0.997196 0.944141 0.998573 0.944966 0.998550
MitoTracker_WT_Untreated 0.998278 0.971369 0.998937 0.957224 0.999299
NCL_WT_Untreated 0.998537 0.985622 0.998852 0.954413 0.999649
NEMO_WT_Untreated 0.996429 0.960688 0.997335 0.901299 0.999002
P54_WT_Untreated 0.997610 0.929915 0.999009 0.950920 0.998553
PEX14_WT_Untreated 0.997362 0.917282 0.999137 0.959276 0.998168
PML_WT_Untreated 0.992820 0.700378 0.998481 0.899254 0.994224
PSD95_WT_Untreated 0.988537 0.792869 0.992429 0.675669 0.995865
PURA_WT_Untreated 0.996440 0.981078 0.996803 0.878593 0.999552
Phalloidin_WT_Untreated 0.997053 0.887317 0.999638 0.983001 0.997351
SNCA_WT_Untreated 0.993416 0.968783 0.993941 0.773063 0.999331
SQSTM1_WT_Untreated 0.989000 0.575207 0.996293 0.732252 0.992542
TDP43_WT_Untreated 0.996523 0.927056 0.997999 0.907792 0.998449
TIA1_WT_Untreated 0.995292 0.935399 0.996701 0.869614 0.998478
TOMM20_WT_Untreated 0.998433 0.967548 0.999186 0.966651 0.999208
TUJ1_WT_Untreated 0.991169 0.989229 0.991634 0.965882 0.997406
Macro Average 0.995364 0.896118 0.997556 0.907639 0.997616
{'Accuracy': 0.99536438151917,
'Sensitivity': 0.8961180450904996,
'Specificity': 0.9975556871415043,
'PPV': 0.9076394777540618,
'NPV': 0.99761554225463}
## Baseline
run_baseline_model(
dataset_config= dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
2025-08-19 14:34:37 INFO: [load_embeddings] multiplex=False 2025-08-19 14:34:37 INFO: [load_embeddings] experiment_type = NIH 2025-08-19 14:34:37 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-19 14:34:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-19 14:34:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-19 14:34:40 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-19 14:34:41 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-19 14:34:41 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-19 14:34:41 INFO: [load_embeddings] labels shape: (115587,) 2025-08-19 14:34:41 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-19 14:34:41 INFO: [load_embeddings] paths shape: (115587,) 2025-08-19 14:34:41 INFO: [load_embeddings] multiplex=False 2025-08-19 14:34:41 INFO: [load_embeddings] experiment_type = NIH 2025-08-19 14:34:41 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-19 14:34:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-19 14:34:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-19 14:34:44 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-19 14:34:45 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-19 14:34:45 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-19 14:34:45 INFO: [load_embeddings] labels shape: (94059,) 2025-08-19 14:34:45 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-19 14:34:45 INFO: [load_embeddings] paths shape: (94059,) 2025-08-19 14:34:45 INFO: [load_embeddings] multiplex=False 2025-08-19 14:34:45 INFO: [load_embeddings] experiment_type = NIH 2025-08-19 14:34:45 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-19 14:34:45 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-19 14:34:47 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-19 14:34:48 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-19 14:34:49 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-19 14:34:49 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-19 14:34:49 INFO: [load_embeddings] labels shape: (87130,) 2025-08-19 14:34:49 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-19 14:34:49 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [2, 3], Testing on: [1].
=== Batch [1] ===
Train: (181189, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DCP1A_WT_Untreated: 4220
SQSTM1_WT_Untreated: 3138
ANAX11_WT_Untreated: 3973
TUJ1_WT_Untreated: 35000
DAPI_WT_Untreated: 47422
Calreticulin_WT_Untreated: 4411
NEMO_WT_Untreated: 4477
SNCA_WT_Untreated: 3780
TOMM20_WT_Untreated: 4314
KIF5A_WT_Untreated: 4462
Phalloidin_WT_Untreated: 4171
PEX14_WT_Untreated: 3929
GM130_WT_Untreated: 4255
MitoTracker_WT_Untreated: 4331
NCL_WT_Untreated: 4312
CLTC_WT_Untreated: 4580
FMRP_WT_Untreated: 4100
PSD95_WT_Untreated: 3534
PML_WT_Untreated: 3441
G3BP1_WT_Untreated: 4649
TDP43_WT_Untreated: 3770
P54_WT_Untreated: 3667
PURA_WT_Untreated: 4175
TIA1_WT_Untreated: 4164
FUS_WT_Untreated: 4331
LAMP1_WT_Untreated: 4583
precision recall f1-score support
0 0.93 0.94 0.94 2614
1 0.95 0.95 0.95 2439
2 0.94 0.95 0.95 3056
3 1.00 1.00 1.00 30428
4 0.89 0.90 0.90 2364
5 0.89 0.89 0.89 2913
6 0.96 0.98 0.97 2728
7 0.98 0.98 0.98 2842
8 0.97 0.98 0.97 2371
9 0.79 0.90 0.84 2622
10 0.96 0.95 0.95 3067
11 0.99 0.98 0.98 2728
12 1.00 0.98 0.99 2709
13 0.96 0.97 0.97 2935
14 0.97 0.97 0.97 2622
15 0.94 0.97 0.96 2505
16 0.83 0.90 0.87 2297
17 0.89 0.82 0.85 2101
18 0.99 0.96 0.97 2712
19 0.88 0.98 0.93 2219
20 0.96 0.84 0.90 2454
21 0.83 0.72 0.77 2651
22 0.95 0.94 0.95 2534
23 0.94 0.91 0.93 2712
24 0.97 0.99 0.98 2363
25 0.99 0.99 0.99 22601
accuracy 0.96 115587
macro avg 0.94 0.94 0.94 115587
weighted avg 0.96 0.96 0.96 115587
Training on Batches: [1, 3], Testing on: [2].
=== Batch [2] ===
Train: (202717, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 53027
TUJ1_WT_Untreated: 39070
LAMP1_WT_Untreated: 5407
DCP1A_WT_Untreated: 4265
TDP43_WT_Untreated: 4370
P54_WT_Untreated: 4373
SNCA_WT_Untreated: 4311
CLTC_WT_Untreated: 4483
PEX14_WT_Untreated: 4360
PURA_WT_Untreated: 4797
G3BP1_WT_Untreated: 5226
Phalloidin_WT_Untreated: 4371
NEMO_WT_Untreated: 5052
SQSTM1_WT_Untreated: 4135
PML_WT_Untreated: 3920
GM130_WT_Untreated: 4516
Calreticulin_WT_Untreated: 5388
KIF5A_WT_Untreated: 4980
FMRP_WT_Untreated: 4405
NCL_WT_Untreated: 4794
TOMM20_WT_Untreated: 4563
FUS_WT_Untreated: 4823
MitoTracker_WT_Untreated: 4823
TIA1_WT_Untreated: 4790
PSD95_WT_Untreated: 4004
ANAX11_WT_Untreated: 4464
precision recall f1-score support
0 0.91 0.95 0.93 2123
1 0.97 0.97 0.97 2536
2 0.95 0.95 0.95 2079
3 1.00 1.00 1.00 24823
4 0.88 0.87 0.87 2319
5 0.89 0.86 0.87 2608
6 0.97 0.92 0.94 2236
7 0.97 0.98 0.97 2265
8 0.97 0.98 0.98 2110
9 0.89 0.90 0.89 2104
10 0.94 0.93 0.93 2243
11 0.97 0.99 0.98 2236
12 0.98 0.98 0.98 2227
13 0.96 0.89 0.93 2360
14 0.96 0.95 0.95 1916
15 0.94 0.96 0.95 2074
16 0.88 0.87 0.87 1818
17 0.86 0.82 0.84 1631
18 0.97 0.97 0.97 2090
19 0.98 0.92 0.95 2019
20 0.91 0.93 0.92 1923
21 0.73 0.83 0.77 1654
22 0.89 0.94 0.91 1934
23 0.94 0.94 0.94 2086
24 0.96 0.98 0.97 2114
25 0.99 0.99 0.99 18531
accuracy 0.96 94059
macro avg 0.93 0.93 0.93 94059
weighted avg 0.96 0.96 0.96 94059
Training on Batches: [1, 2], Testing on: [3].
=== Batch [3] ===
Train: (209646, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 55251
TUJ1_WT_Untreated: 41132
LAMP1_WT_Untreated: 5310
DCP1A_WT_Untreated: 4683
TDP43_WT_Untreated: 4468
P54_WT_Untreated: 4538
SNCA_WT_Untreated: 4377
CLTC_WT_Untreated: 4975
PEX14_WT_Untreated: 4579
PURA_WT_Untreated: 4802
G3BP1_WT_Untreated: 5107
Phalloidin_WT_Untreated: 4238
NEMO_WT_Untreated: 5295
SQSTM1_WT_Untreated: 4305
PML_WT_Untreated: 4115
GM130_WT_Untreated: 4481
Calreticulin_WT_Untreated: 5135
KIF5A_WT_Untreated: 4726
FMRP_WT_Untreated: 5521
NCL_WT_Untreated: 4936
TOMM20_WT_Untreated: 4477
FUS_WT_Untreated: 4964
MitoTracker_WT_Untreated: 4964
TIA1_WT_Untreated: 4798
PSD95_WT_Untreated: 3732
ANAX11_WT_Untreated: 4737
precision recall f1-score support
0 0.90 0.90 0.90 1850
1 0.97 0.62 0.75 2044
2 0.96 0.97 0.96 2332
3 0.99 1.00 1.00 22599
4 0.74 0.80 0.77 1901
5 0.69 0.57 0.62 1492
6 0.97 0.89 0.93 2095
7 0.96 0.95 0.95 2384
8 0.98 0.98 0.98 2145
9 0.97 0.90 0.93 2358
10 0.96 0.96 0.96 2340
11 0.98 0.96 0.97 2095
12 0.95 0.99 0.97 2085
13 0.82 0.99 0.90 2117
14 0.94 0.94 0.94 1751
15 0.97 0.88 0.92 1855
16 0.84 0.55 0.66 1623
17 0.55 0.72 0.62 1903
18 0.88 0.99 0.93 2085
19 0.98 0.93 0.95 2152
20 0.74 0.97 0.84 1857
21 0.69 0.25 0.37 1484
22 0.93 0.94 0.93 1836
23 0.83 0.94 0.89 2078
24 0.97 0.97 0.97 2200
25 0.95 0.99 0.97 16469
accuracy 0.93 87130
macro avg 0.89 0.87 0.87 87130
weighted avg 0.93 0.93 0.92 87130
=== Overall Accuracy ===
0.9499144449661753 [0.9621670257035826, 0.9605247770016692, 0.9270515321932744]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.996651 0.934113 0.998070 0.916580 0.998504
CLTC_WT_Untreated 0.995879 0.860521 0.999158 0.961171 0.996630
Calreticulin_WT_Untreated 0.997557 0.953127 0.998704 0.949947 0.998790
DAPI_WT_Untreated 0.999431 0.999961 0.999242 0.997872 0.999986
DCP1A_WT_Untreated 0.993278 0.861634 0.996265 0.839574 0.996859
FMRP_WT_Untreated 0.992240 0.809497 0.996663 0.854455 0.995395
FUS_WT_Untreated 0.997685 0.933418 0.999251 0.968116 0.998379
G3BP1_WT_Untreated 0.998528 0.971699 0.999222 0.970016 0.999267
GM130_WT_Untreated 0.998912 0.978117 0.999387 0.973269 0.999500
KIF5A_WT_Untreated 0.994413 0.898927 0.996748 0.871135 0.997526
LAMP1_WT_Untreated 0.997409 0.945882 0.998772 0.953234 0.998568
MitoTracker_WT_Untreated 0.998932 0.975776 0.999496 0.979244 0.999410
NCL_WT_Untreated 0.999073 0.982766 0.999469 0.978168 0.999582
NEMO_WT_Untreated 0.996624 0.953724 0.997723 0.914726 0.998813
P54_WT_Untreated 0.998137 0.954206 0.999088 0.957708 0.999009
PEX14_WT_Untreated 0.997598 0.942648 0.998815 0.946325 0.998729
PML_WT_Untreated 0.993274 0.790694 0.997268 0.850900 0.995879
PSD95_WT_Untreated 0.990666 0.784383 0.994659 0.739749 0.995822
PURA_WT_Untreated 0.998056 0.969508 0.998734 0.947899 0.999275
Phalloidin_WT_Untreated 0.997564 0.944757 0.998726 0.942251 0.998784
SNCA_WT_Untreated 0.995027 0.905839 0.996940 0.863984 0.997978
SQSTM1_WT_Untreated 0.989160 0.632234 0.996261 0.770851 0.992710
TDP43_WT_Untreated 0.997119 0.940990 0.998337 0.924708 0.998719
TIA1_WT_Untreated 0.996155 0.931646 0.997685 0.905186 0.998378
TOMM20_WT_Untreated 0.998864 0.980081 0.999297 0.969769 0.999541
TUJ1_WT_Untreated 0.994444 0.991146 0.995238 0.980440 0.997862
Macro Average 0.996257 0.916434 0.998047 0.920280 0.998073
run_train_test_split_baseline(
dataset_config,
batches=[1],
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
2025-08-20 12:30:33 INFO: [load_embeddings] multiplex=False 2025-08-20 12:30:33 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 12:30:33 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 12:30:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 12:30:35 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 12:30:36 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 12:30:37 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 12:30:37 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 12:30:37 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 12:30:37 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 12:30:37 INFO: [load_embeddings] paths shape: (115587,)
Train dataset
(92469,) (92469, 192) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
3: 24342
11: 2182
7: 2274
9: 2098
6: 2182
24: 1890
13: 2348
12: 2167
2: 2445
14: 2098
19: 1775
25: 18081
21: 2121
10: 2453
0: 2091
4: 1891
20: 1963
15: 2004
16: 1838
22: 2027
5: 2330
17: 1681
1: 1951
8: 1897
23: 2170
18: 2170
Test dataset
(23118,) (23118, 192) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
25: 4520
5: 583
3: 6086
23: 542
24: 473
12: 542
9: 524
19: 444
0: 523
11: 546
7: 568
4: 473
1: 488
13: 587
10: 614
22: 507
15: 501
2: 611
17: 420
6: 546
16: 459
20: 491
21: 530
14: 524
18: 542
8: 474
precision recall f1-score support
0 0.94 0.95 0.94 523
1 0.97 0.95 0.96 488
2 0.95 0.96 0.95 611
3 1.00 1.00 1.00 6086
4 0.90 0.91 0.91 473
5 0.91 0.90 0.91 583
6 0.98 0.97 0.97 546
7 0.98 0.99 0.98 568
8 0.98 0.97 0.97 474
9 0.89 0.87 0.88 524
10 0.95 0.95 0.95 614
11 0.98 0.99 0.98 546
12 0.99 0.99 0.99 542
13 0.97 0.97 0.97 587
14 0.97 0.96 0.97 524
15 0.95 0.96 0.96 501
16 0.89 0.89 0.89 459
17 0.92 0.91 0.92 420
18 0.97 0.98 0.97 542
19 0.98 0.94 0.96 444
20 0.91 0.92 0.92 491
21 0.86 0.87 0.86 530
22 0.93 0.95 0.94 507
23 0.95 0.94 0.94 542
24 0.97 0.99 0.98 473
25 0.99 0.99 0.99 4520
accuracy 0.97 23118
macro avg 0.95 0.95 0.95 23118
weighted avg 0.97 0.97 0.97 23118
Accuracy: 0.9695
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.997405 0.950287 0.998495 0.935970 0.998849
CLTC_WT_Untreated 0.998356 0.948770 0.999426 0.972689 0.998896
Calreticulin_WT_Untreated 0.997578 0.960720 0.998578 0.948304 0.998933
DAPI_WT_Untreated 1.000000 1.000000 1.000000 1.000000 1.000000
DCP1A_WT_Untreated 0.996107 0.911205 0.997880 0.899791 0.998145
FMRP_WT_Untreated 0.995372 0.903945 0.997737 0.911765 0.997516
FUS_WT_Untreated 0.998789 0.965201 0.999601 0.983209 0.999159
G3BP1_WT_Untreated 0.999221 0.989437 0.999468 0.979094 0.999734
GM130_WT_Untreated 0.998875 0.968354 0.999514 0.976596 0.999338
KIF5A_WT_Untreated 0.994593 0.868321 0.997521 0.890411 0.996948
LAMP1_WT_Untreated 0.997578 0.954397 0.998756 0.954397 0.998756
MitoTracker_WT_Untreated 0.999221 0.987179 0.999513 0.980000 0.999690
NCL_WT_Untreated 0.999481 0.987085 0.999779 0.990741 0.999690
NEMO_WT_Untreated 0.998486 0.974446 0.999112 0.966216 0.999334
P54_WT_Untreated 0.998486 0.958015 0.999425 0.974757 0.999027
PEX14_WT_Untreated 0.998097 0.958084 0.998983 0.954274 0.999071
PML_WT_Untreated 0.995718 0.893246 0.997793 0.891304 0.997837
PSD95_WT_Untreated 0.996972 0.911905 0.998546 0.920673 0.998370
PURA_WT_Untreated 0.998702 0.976015 0.999247 0.968864 0.999424
Phalloidin_WT_Untreated 0.998573 0.943694 0.999647 0.981265 0.998898
SNCA_WT_Untreated 0.996410 0.916497 0.998144 0.914634 0.998188
SQSTM1_WT_Untreated 0.993771 0.869811 0.996680 0.860075 0.996944
TDP43_WT_Untreated 0.997318 0.948718 0.998408 0.930368 0.998850
TIA1_WT_Untreated 0.997361 0.939114 0.998760 0.947858 0.998539
TOMM20_WT_Untreated 0.999092 0.987315 0.999338 0.968880 0.999735
TUJ1_WT_Untreated 0.997534 0.994912 0.998172 0.992496 0.998763
Macro Average 0.997658 0.948718 0.998789 0.949794 0.998794
## Baseline
run_baseline_model(
dataset_config= dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=GaussianNB,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:25:56 INFO: [load_embeddings] multiplex=False 2025-08-20 16:25:56 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:25:56 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:25:56 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:25:59 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:00 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:00 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:01 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:26:01 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:26:01 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:26:01 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:26:01 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:01 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:01 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:26:01 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:03 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:04 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:04 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:04 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:26:04 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:26:04 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:26:04 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:26:04 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:04 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:04 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:26:04 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:06 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:07 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:08 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:08 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:26:08 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:26:08 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:26:08 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.77 0.92 0.84 2123
1 0.90 0.90 0.90 2536
2 0.77 0.80 0.78 2079
3 1.00 1.00 1.00 24823
4 0.81 0.69 0.74 2319
5 0.77 0.76 0.76 2608
6 0.93 0.89 0.91 2236
7 0.92 0.96 0.94 2265
8 0.97 0.93 0.95 2110
9 0.89 0.74 0.81 2104
10 0.88 0.89 0.88 2243
11 0.80 0.97 0.87 2236
12 0.97 0.93 0.95 2227
13 0.91 0.90 0.90 2360
14 0.93 0.78 0.85 1916
15 0.91 0.94 0.92 2074
16 0.84 0.78 0.81 1818
17 0.78 0.83 0.80 1631
18 0.84 0.95 0.89 2090
19 0.95 0.85 0.89 2019
20 0.70 0.90 0.79 1923
21 0.64 0.69 0.66 1654
22 0.77 0.89 0.83 1934
23 0.86 0.86 0.86 2086
24 0.94 0.95 0.94 2114
25 0.99 0.95 0.97 18531
accuracy 0.92 94059
macro avg 0.86 0.87 0.86 94059
weighted avg 0.92 0.92 0.92 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.66 0.90 0.76 1850
1 0.85 0.69 0.76 2044
2 0.75 0.85 0.79 2332
3 1.00 1.00 1.00 22599
4 0.61 0.58 0.60 1901
5 0.50 0.43 0.46 1492
6 0.95 0.86 0.90 2095
7 0.88 0.93 0.90 2384
8 0.96 0.93 0.94 2145
9 0.94 0.72 0.82 2358
10 0.90 0.92 0.91 2340
11 0.80 0.90 0.85 2095
12 0.92 0.95 0.94 2085
13 0.82 0.97 0.89 2117
14 0.92 0.80 0.86 1751
15 0.92 0.76 0.83 1855
16 0.78 0.44 0.56 1623
17 0.54 0.65 0.59 1903
18 0.67 0.96 0.79 2085
19 0.96 0.83 0.89 2152
20 0.58 0.97 0.72 1857
21 0.49 0.27 0.35 1484
22 0.84 0.88 0.86 1836
23 0.80 0.84 0.82 2078
24 0.95 0.95 0.95 2200
25 0.97 0.91 0.94 16469
accuracy 0.88 87130
macro avg 0.81 0.80 0.80 87130
weighted avg 0.88 0.88 0.88 87130
=== Overall Accuracy ===
0.8964025760808889 [0.9159889005836762, 0.8768162515781017]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.990226 0.910647 0.992010 0.718713 0.997985
CLTC_WT_Untreated 0.992290 0.803275 0.997192 0.881198 0.994910
Calreticulin_WT_Untreated 0.989304 0.826797 0.993359 0.756482 0.995668
DAPI_WT_Untreated 0.998841 0.998313 0.999028 0.997262 0.999402
DCP1A_WT_Untreated 0.985667 0.638863 0.993937 0.715309 0.991410
FMRP_WT_Untreated 0.984971 0.639756 0.992964 0.677953 0.991670
FUS_WT_Untreated 0.995739 0.879704 0.998581 0.938193 0.997059
G3BP1_WT_Untreated 0.995850 0.942353 0.997258 0.900514 0.998480
GM130_WT_Untreated 0.997511 0.929260 0.999152 0.963450 0.998300
KIF5A_WT_Untreated 0.991661 0.728821 0.998297 0.915283 0.993188
LAMP1_WT_Untreated 0.994834 0.903120 0.997214 0.893759 0.997485
MitoTracker_WT_Untreated 0.992864 0.933964 0.994306 0.800673 0.998376
NCL_WT_Untreated 0.997356 0.940631 0.998739 0.947885 0.998553
NEMO_WT_Untreated 0.994724 0.932991 0.996288 0.864267 0.998299
P54_WT_Untreated 0.994497 0.789746 0.998727 0.927611 0.995670
PEX14_WT_Untreated 0.995027 0.854416 0.998144 0.910743 0.996778
PML_WT_Untreated 0.990143 0.619587 0.997316 0.817171 0.992670
PSD95_WT_Untreated 0.986738 0.733447 0.991776 0.639526 0.994682
PURA_WT_Untreated 0.991291 0.953772 0.992176 0.741941 0.998902
Phalloidin_WT_Untreated 0.995347 0.840086 0.999006 0.952174 0.996242
SNCA_WT_Untreated 0.987196 0.934656 0.988315 0.630218 0.998593
SQSTM1_WT_Untreated 0.985369 0.490121 0.994097 0.594052 0.991041
TDP43_WT_Untreated 0.993123 0.887268 0.995373 0.802928 0.997599
TIA1_WT_Untreated 0.992555 0.849424 0.995921 0.830477 0.996456
TOMM20_WT_Untreated 0.997389 0.948540 0.998581 0.942206 0.998745
TUJ1_WT_Untreated 0.983790 0.932200 0.996142 0.983007 0.983966
Macro Average 0.992089 0.840068 0.995919 0.836269 0.995851
{'Accuracy': 0.9920885840836833,
'Sensitivity': 0.8400676286360618,
'Specificity': 0.9959191889847439,
'PPV': 0.8362690088380957,
'NPV': 0.9958511944497294}
## Baseline
run_baseline_model(
dataset_config= dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=RidgeClassifier,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:26:20 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:20 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:20 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:26:20 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:26:23 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:24 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:25 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:25 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:26:25 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:26:25 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:26:25 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:26:25 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:25 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:25 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:26:25 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:27 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:28 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:28 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:29 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:26:29 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:26:29 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:26:29 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:26:29 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:29 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:29 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:26:29 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:31 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:31 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:32 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:32 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:26:32 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:26:32 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:26:32 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.83 0.92 0.87 2123
1 0.98 0.77 0.87 2536
2 0.79 0.85 0.82 2079
3 1.00 1.00 1.00 24823
4 0.88 0.74 0.80 2319
5 0.79 0.86 0.82 2608
6 0.95 0.90 0.93 2236
7 0.96 0.96 0.96 2265
8 0.96 0.95 0.96 2110
9 0.88 0.82 0.85 2104
10 0.91 0.89 0.90 2243
11 0.85 0.97 0.91 2236
12 0.98 0.94 0.96 2227
13 0.89 0.93 0.91 2360
14 0.96 0.88 0.92 1916
15 0.92 0.97 0.94 2074
16 0.93 0.76 0.84 1818
17 0.89 0.77 0.82 1631
18 0.89 0.97 0.93 2090
19 0.97 0.71 0.82 2019
20 0.85 0.91 0.88 1923
21 0.77 0.69 0.73 1654
22 0.83 0.92 0.87 1934
23 0.92 0.90 0.91 2086
24 0.94 0.94 0.94 2114
25 0.94 0.99 0.97 18531
accuracy 0.93 94059
macro avg 0.90 0.88 0.89 94059
weighted avg 0.93 0.93 0.93 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.81 0.89 0.85 1850
1 0.95 0.19 0.31 2044
2 0.87 0.90 0.88 2332
3 0.99 1.00 0.99 22599
4 0.65 0.61 0.63 1901
5 0.49 0.46 0.48 1492
6 0.95 0.83 0.89 2095
7 0.94 0.93 0.93 2384
8 0.94 0.94 0.94 2145
9 0.94 0.77 0.84 2358
10 0.93 0.92 0.92 2340
11 0.88 0.96 0.91 2095
12 0.96 0.96 0.96 2085
13 0.76 0.99 0.86 2117
14 0.92 0.87 0.90 1751
15 0.94 0.91 0.93 1855
16 0.87 0.45 0.59 1623
17 0.56 0.61 0.58 1903
18 0.77 0.98 0.86 2085
19 0.97 0.75 0.85 2152
20 0.65 0.89 0.75 1857
21 0.66 0.27 0.38 1484
22 0.87 0.91 0.89 1836
23 0.84 0.88 0.86 2078
24 0.94 0.95 0.95 2200
25 0.88 0.98 0.93 16469
accuracy 0.89 87130
macro avg 0.84 0.80 0.80 87130
weighted avg 0.89 0.89 0.88 87130
=== Overall Accuracy ===
0.9085961175275523 [0.9313090719654685, 0.8858831630896362]
Exception ignored in: <cyfunction RandomForestClassifier.__del__ at 0x14bc241b8a00> Traceback (most recent call last): File "randomforestclassifier.pyx", line 317, in cuml.ensemble.randomforestclassifier.RandomForestClassifier.__del__ File "randomforestclassifier.pyx", line 321, in cuml.ensemble.randomforestclassifier.RandomForestClassifier._reset_forest_data File "base.pyx", line 330, in cuml.internals.base.Base.__getattr__ AttributeError: rf_forest
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.993614 0.903851 0.995627 0.822492 0.997840
CLTC_WT_Untreated 0.987345 0.511354 0.999689 0.977055 0.987483
Calreticulin_WT_Untreated 0.992671 0.876672 0.995565 0.831434 0.996919
DAPI_WT_Untreated 0.997577 1.000000 0.996718 0.990828 1.000000
DCP1A_WT_Untreated 0.987913 0.684597 0.995146 0.770811 0.992499
FMRP_WT_Untreated 0.986219 0.717317 0.992444 0.687310 0.993449
FUS_WT_Untreated 0.995783 0.865851 0.998965 0.953471 0.996722
G3BP1_WT_Untreated 0.997279 0.944504 0.998669 0.949200 0.998539
GM130_WT_Untreated 0.997687 0.947826 0.998887 0.953428 0.998745
KIF5A_WT_Untreated 0.992886 0.789556 0.998020 0.909631 0.994704
LAMP1_WT_Untreated 0.995601 0.906830 0.997905 0.918250 0.997583
MitoTracker_WT_Untreated 0.995563 0.965828 0.996291 0.864435 0.999161
NCL_WT_Untreated 0.998162 0.950603 0.999322 0.971557 0.998796
NEMO_WT_Untreated 0.993808 0.958008 0.994715 0.821176 0.998932
P54_WT_Untreated 0.996330 0.875375 0.998828 0.939146 0.997429
PEX14_WT_Untreated 0.997207 0.941206 0.998449 0.930783 0.998697
PML_WT_Untreated 0.991517 0.614066 0.998824 0.909991 0.992575
PSD95_WT_Untreated 0.987875 0.680249 0.993994 0.692596 0.993642
PURA_WT_Untreated 0.994624 0.977006 0.995040 0.822877 0.999455
Phalloidin_WT_Untreated 0.993394 0.734836 0.999486 0.971166 0.993788
SNCA_WT_Untreated 0.991241 0.898942 0.993208 0.738214 0.997837
SQSTM1_WT_Untreated 0.988150 0.491077 0.996911 0.736968 0.991083
TDP43_WT_Untreated 0.994801 0.916711 0.996460 0.846229 0.998227
TIA1_WT_Untreated 0.994525 0.888569 0.997017 0.875118 0.997378
TOMM20_WT_Untreated 0.997373 0.945526 0.998637 0.944213 0.998671
TUJ1_WT_Untreated 0.979784 0.987600 0.977912 0.914565 0.996973
Macro Average 0.993036 0.845152 0.996259 0.874729 0.996428
{'Accuracy': 0.9930357463541044,
'Sensitivity': 0.8451522565708357,
'Specificity': 0.9962587823367731,
'PPV': 0.8747285842320065,
'NPV': 0.9964279364711918}
## Baseline
run_baseline_model(
dataset_config= dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=LinearSVC,
classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:26:37 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:37 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:37 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:26:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:26:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:40 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:41 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:41 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:26:41 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:26:41 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:26:41 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:26:41 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:41 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:41 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:26:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:44 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:45 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:45 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:26:45 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:26:45 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:26:45 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:26:45 INFO: [load_embeddings] multiplex=False 2025-08-20 16:26:45 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:26:45 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:26:45 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:26:47 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:26:48 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:26:48 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:26:48 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:26:48 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:26:48 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:26:48 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] DAPI_WT_Untreated: 30428 TUJ1_WT_Untreated: 22601 LAMP1_WT_Untreated: 3067 DCP1A_WT_Untreated: 2364 TDP43_WT_Untreated: 2534 P54_WT_Untreated: 2622 SNCA_WT_Untreated: 2454 CLTC_WT_Untreated: 2439 PEX14_WT_Untreated: 2505 PURA_WT_Untreated: 2712 G3BP1_WT_Untreated: 2842 Phalloidin_WT_Untreated: 2219 NEMO_WT_Untreated: 2935 SQSTM1_WT_Untreated: 2651 PML_WT_Untreated: 2297 GM130_WT_Untreated: 2371 Calreticulin_WT_Untreated: 3056 KIF5A_WT_Untreated: 2622 FMRP_WT_Untreated: 2913 NCL_WT_Untreated: 2709 TOMM20_WT_Untreated: 2363 FUS_WT_Untreated: 2728 MitoTracker_WT_Untreated: 2728 TIA1_WT_Untreated: 2712 PSD95_WT_Untreated: 2101 ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
precision recall f1-score support
0 0.92 0.96 0.94 2123
1 0.99 0.97 0.98 2536
2 0.96 0.96 0.96 2079
3 1.00 1.00 1.00 24823
4 0.92 0.86 0.89 2319
5 0.89 0.91 0.90 2608
6 0.99 0.94 0.96 2236
7 0.98 0.98 0.98 2265
8 0.98 0.98 0.98 2110
9 0.94 0.88 0.91 2104
10 0.95 0.94 0.95 2243
11 0.97 0.99 0.98 2236
12 0.99 1.00 0.99 2227
13 0.97 0.94 0.95 2360
14 0.96 0.95 0.96 1916
15 0.96 0.97 0.97 2074
16 0.93 0.88 0.91 1818
17 0.87 0.89 0.88 1631
18 0.95 0.98 0.97 2090
19 0.99 0.91 0.95 2019
20 0.90 0.97 0.94 1923
21 0.79 0.84 0.81 1654
22 0.91 0.94 0.93 1934
23 0.93 0.95 0.94 2086
24 0.97 0.98 0.97 2114
25 0.99 1.00 0.99 18531
accuracy 0.97 94059
macro avg 0.95 0.95 0.95 94059
weighted avg 0.97 0.97 0.97 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
precision recall f1-score support
0 0.88 0.91 0.89 1850
1 0.97 0.61 0.75 2044
2 0.96 0.97 0.97 2332
3 1.00 1.00 1.00 22599
4 0.74 0.77 0.75 1901
5 0.67 0.58 0.62 1492
6 0.98 0.90 0.94 2095
7 0.95 0.97 0.96 2384
8 0.97 0.98 0.98 2145
9 0.97 0.83 0.90 2358
10 0.96 0.96 0.96 2340
11 0.98 0.96 0.97 2095
12 0.96 0.99 0.98 2085
13 0.88 1.00 0.94 2117
14 0.93 0.94 0.94 1751
15 0.97 0.90 0.94 1855
16 0.87 0.55 0.68 1623
17 0.55 0.73 0.63 1903
18 0.86 0.99 0.92 2085
19 0.98 0.92 0.95 2152
20 0.72 0.98 0.83 1857
21 0.58 0.26 0.36 1484
22 0.94 0.92 0.93 1836
23 0.82 0.95 0.88 2078
24 0.97 0.98 0.97 2200
25 0.95 0.99 0.97 16469
accuracy 0.93 87130
macro avg 0.89 0.87 0.87 87130
weighted avg 0.93 0.93 0.92 87130
=== Overall Accuracy ===
0.9474422945683487 [0.9680625990070062, 0.9268219901296912]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.996335 0.932796 0.997760 0.903242 0.998492
CLTC_WT_Untreated 0.994867 0.811572 0.999621 0.982294 0.995135
Calreticulin_WT_Untreated 0.998162 0.964861 0.998993 0.959856 0.999123
DAPI_WT_Untreated 0.999349 1.000000 0.999118 0.997518 1.000000
DCP1A_WT_Untreated 0.991931 0.818246 0.996073 0.832449 0.995668
FMRP_WT_Untreated 0.991324 0.792927 0.995917 0.818067 0.995209
FUS_WT_Untreated 0.997798 0.921727 0.999661 0.985192 0.998086
G3BP1_WT_Untreated 0.998438 0.976554 0.999014 0.963089 0.999382
GM130_WT_Untreated 0.999023 0.980964 0.999457 0.977518 0.999542
KIF5A_WT_Untreated 0.995452 0.855670 0.998981 0.954977 0.996366
LAMP1_WT_Untreated 0.997715 0.952215 0.998896 0.957227 0.998760
MitoTracker_WT_Untreated 0.998852 0.978065 0.999361 0.974017 0.999463
NCL_WT_Untreated 0.999299 0.994666 0.999412 0.976326 0.999870
NEMO_WT_Untreated 0.997218 0.966719 0.997991 0.924194 0.999156
P54_WT_Untreated 0.997842 0.945460 0.998924 0.947786 0.998873
PEX14_WT_Untreated 0.997997 0.937643 0.999334 0.968964 0.998619
PML_WT_Untreated 0.993460 0.727695 0.998605 0.909884 0.994749
PSD95_WT_Untreated 0.988658 0.805603 0.992300 0.675445 0.996118
PURA_WT_Untreated 0.997285 0.987066 0.997526 0.903926 0.999694
Phalloidin_WT_Untreated 0.997687 0.913210 0.999678 0.985256 0.997959
SNCA_WT_Untreated 0.994464 0.973545 0.994910 0.802967 0.999434
SQSTM1_WT_Untreated 0.988885 0.563735 0.996377 0.732809 0.992342
TDP43_WT_Untreated 0.997003 0.932361 0.998377 0.924270 0.998562
TIA1_WT_Untreated 0.995684 0.951249 0.996729 0.872467 0.998851
TOMM20_WT_Untreated 0.998697 0.976356 0.999242 0.969167 0.999423
TUJ1_WT_Untreated 0.993035 0.991257 0.993461 0.973184 0.997897
Macro Average 0.996018 0.909699 0.997912 0.918157 0.997953
{'Accuracy': 0.9960177579127958,
'Sensitivity': 0.9096985119886605,
'Specificity': 0.9979122458246539,
'PPV': 0.9181573382480437,
'NPV': 0.997952869605}
run_baseline_model(
dataset_config,
batches=[1,2,3,],
classifier_class=cuRF,
classifier_kwargs={"n_estimators": 300, "random_state": 42}, # max_depth=0 => unlimited in cuML
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:27:33 INFO: [load_embeddings] multiplex=False 2025-08-20 16:27:33 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:27:33 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:27:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:27:36 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:27:37 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:27:37 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:27:38 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:27:38 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:27:38 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:27:38 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:27:38 INFO: [load_embeddings] multiplex=False 2025-08-20 16:27:38 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:27:38 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:27:38 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:27:40 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:27:41 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:27:41 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:27:42 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:27:42 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:27:42 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:27:42 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:27:42 INFO: [load_embeddings] multiplex=False 2025-08-20 16:27:42 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:27:42 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:27:42 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:27:44 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:27:45 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:27:45 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:27:46 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:27:46 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:27:46 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:27:46 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] DAPI_WT_Untreated: 30428 TUJ1_WT_Untreated: 22601 LAMP1_WT_Untreated: 3067 DCP1A_WT_Untreated: 2364 TDP43_WT_Untreated: 2534 P54_WT_Untreated: 2622 SNCA_WT_Untreated: 2454 CLTC_WT_Untreated: 2439 PEX14_WT_Untreated: 2505 PURA_WT_Untreated: 2712 G3BP1_WT_Untreated: 2842 Phalloidin_WT_Untreated: 2219 NEMO_WT_Untreated: 2935 SQSTM1_WT_Untreated: 2651 PML_WT_Untreated: 2297 GM130_WT_Untreated: 2371 Calreticulin_WT_Untreated: 3056 KIF5A_WT_Untreated: 2622 FMRP_WT_Untreated: 2913 NCL_WT_Untreated: 2709 TOMM20_WT_Untreated: 2363 FUS_WT_Untreated: 2728 MitoTracker_WT_Untreated: 2728 TIA1_WT_Untreated: 2712 PSD95_WT_Untreated: 2101 ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.87 0.93 0.90 2123
1 0.98 0.91 0.95 2536
2 0.92 0.90 0.91 2079
3 1.00 1.00 1.00 24823
4 0.85 0.77 0.81 2319
5 0.80 0.86 0.83 2608
6 0.98 0.91 0.95 2236
7 0.97 0.97 0.97 2265
8 0.98 0.97 0.98 2110
9 0.93 0.82 0.87 2104
10 0.91 0.93 0.92 2243
11 0.92 0.99 0.95 2236
12 0.98 0.99 0.98 2227
13 0.95 0.93 0.94 2360
14 0.95 0.89 0.92 1916
15 0.94 0.95 0.94 2074
16 0.93 0.84 0.88 1818
17 0.86 0.86 0.86 1631
18 0.92 0.98 0.95 2090
19 0.99 0.83 0.90 2019
20 0.82 0.94 0.87 1923
21 0.74 0.78 0.76 1654
22 0.85 0.92 0.88 1934
23 0.91 0.92 0.92 2086
24 0.97 0.96 0.96 2114
25 0.98 0.99 0.99 18531
accuracy 0.95 94059
macro avg 0.92 0.91 0.91 94059
weighted avg 0.95 0.95 0.95 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.82 0.90 0.86 1850
1 0.96 0.43 0.59 2044
2 0.91 0.94 0.93 2332
3 0.99 1.00 1.00 22599
4 0.66 0.68 0.67 1901
5 0.54 0.51 0.52 1492
6 0.98 0.87 0.92 2095
7 0.94 0.93 0.94 2384
8 0.97 0.97 0.97 2145
9 0.95 0.77 0.85 2358
10 0.94 0.95 0.95 2340
11 0.94 0.95 0.94 2095
12 0.94 0.99 0.97 2085
13 0.84 0.99 0.91 2117
14 0.94 0.88 0.91 1751
15 0.97 0.87 0.91 1855
16 0.86 0.49 0.63 1623
17 0.56 0.68 0.61 1903
18 0.79 0.99 0.88 2085
19 0.99 0.84 0.91 2152
20 0.66 0.97 0.79 1857
21 0.62 0.29 0.39 1484
22 0.89 0.92 0.91 1836
23 0.82 0.91 0.86 2078
24 0.97 0.96 0.97 2200
25 0.92 0.98 0.95 16469
accuracy 0.91 87130
macro avg 0.86 0.83 0.84 87130
weighted avg 0.91 0.91 0.90 87130
=== Overall Accuracy ===
0.9268782814899591 [0.9486492520651931, 0.9051073109147251]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.994591 0.914422 0.996389 0.850222 0.998078
CLTC_WT_Untreated 0.991953 0.697380 0.999592 0.977955 0.992210
Calreticulin_WT_Untreated 0.995960 0.921333 0.997822 0.913464 0.998037
DAPI_WT_Untreated 0.999073 0.999979 0.998752 0.996491 0.999993
DCP1A_WT_Untreated 0.988244 0.731043 0.994378 0.756127 0.993592
FMRP_WT_Untreated 0.987218 0.728780 0.993201 0.712786 0.993717
FUS_WT_Untreated 0.997003 0.892173 0.999570 0.980711 0.997365
G3BP1_WT_Untreated 0.997511 0.950097 0.998759 0.952761 0.998686
GM130_WT_Untreated 0.998653 0.968508 0.999378 0.974001 0.999243
KIF5A_WT_Untreated 0.993675 0.791573 0.998778 0.942369 0.994759
LAMP1_WT_Untreated 0.996584 0.941523 0.998013 0.924775 0.998482
MitoTracker_WT_Untreated 0.997434 0.966290 0.998196 0.929174 0.999174
NCL_WT_Untreated 0.998786 0.988173 0.999045 0.961851 0.999711
NEMO_WT_Untreated 0.996131 0.957338 0.997114 0.893661 0.998917
P54_WT_Untreated 0.996655 0.887647 0.998907 0.943752 0.997682
PEX14_WT_Untreated 0.997064 0.910664 0.998979 0.951849 0.998022
PML_WT_Untreated 0.992527 0.675385 0.998667 0.907458 0.993747
PSD95_WT_Untreated 0.988559 0.758630 0.993133 0.687260 0.995189
PURA_WT_Untreated 0.995590 0.982994 0.995887 0.849338 0.999597
Phalloidin_WT_Untreated 0.995982 0.837209 0.999723 0.986162 0.996178
SNCA_WT_Untreated 0.991821 0.956085 0.992582 0.733063 0.999058
SQSTM1_WT_Untreated 0.988222 0.546845 0.996001 0.706755 0.992045
TDP43_WT_Untreated 0.995447 0.920955 0.997030 0.868217 0.998318
TIA1_WT_Untreated 0.994812 0.915466 0.996678 0.866364 0.998009
TOMM20_WT_Untreated 0.998273 0.958507 0.999242 0.968611 0.998988
TUJ1_WT_Untreated 0.987654 0.986914 0.987831 0.951020 0.996839
Macro Average 0.994439 0.876381 0.997063 0.891777 0.997140
{'Accuracy': 0.9944392956441148,
'Sensitivity': 0.8763812358081148,
'Specificity': 0.9970633461347483,
'PPV': 0.8917767791645945,
'NPV': 0.9971398095115489}
run_baseline_model(dataset_config,
batches=[1,2,3],
classifier_class=ExtraTreesClassifier,
classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
"n_jobs": -1, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:28:08 INFO: [load_embeddings] multiplex=False 2025-08-20 16:28:08 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:28:08 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:28:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen
Loading all batches...
2025-08-20 16:28:10 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:28:11 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:28:12 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:28:13 INFO: [load_embeddings] embeddings shape: (115587, 192) 2025-08-20 16:28:13 INFO: [load_embeddings] labels shape: (115587,) 2025-08-20 16:28:13 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 16:28:13 INFO: [load_embeddings] paths shape: (115587,) 2025-08-20 16:28:13 INFO: [load_embeddings] multiplex=False 2025-08-20 16:28:13 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:28:13 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:28:13 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:28:15 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:28:15 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:28:16 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:28:16 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 16:28:16 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 16:28:16 INFO: [load_embeddings] example label: DCP1A_WT_Untreated 2025-08-20 16:28:16 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 16:28:16 INFO: [load_embeddings] multiplex=False 2025-08-20 16:28:16 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:28:16 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:28:16 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/finetunedModel_MLPHead_acrossBatches_B56789_80pct_frozen 2025-08-20 16:28:18 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:28:19 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:28:19 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:28:20 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 16:28:20 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 16:28:20 INFO: [load_embeddings] example label: TUJ1_WT_Untreated 2025-08-20 16:28:20 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.90 0.94 0.92 2123
1 0.99 0.92 0.95 2536
2 0.91 0.92 0.92 2079
3 1.00 1.00 1.00 24823
4 0.88 0.81 0.84 2319
5 0.84 0.87 0.86 2608
6 0.98 0.93 0.95 2236
7 0.97 0.97 0.97 2265
8 0.98 0.98 0.98 2110
9 0.93 0.86 0.89 2104
10 0.92 0.93 0.92 2243
11 0.93 0.99 0.96 2236
12 0.98 0.99 0.98 2227
13 0.95 0.94 0.94 2360
14 0.96 0.91 0.93 1916
15 0.94 0.96 0.95 2074
16 0.94 0.86 0.90 1818
17 0.87 0.87 0.87 1631
18 0.92 0.98 0.95 2090
19 0.99 0.83 0.90 2019
20 0.85 0.95 0.90 1923
21 0.78 0.79 0.79 1654
22 0.87 0.93 0.90 1934
23 0.93 0.93 0.93 2086
24 0.96 0.96 0.96 2114
25 0.98 0.99 0.99 18531
accuracy 0.95 94059
macro avg 0.93 0.92 0.93 94059
weighted avg 0.96 0.95 0.95 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115587, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
DAPI_WT_Untreated: 30428
TUJ1_WT_Untreated: 22601
LAMP1_WT_Untreated: 3067
DCP1A_WT_Untreated: 2364
TDP43_WT_Untreated: 2534
P54_WT_Untreated: 2622
SNCA_WT_Untreated: 2454
CLTC_WT_Untreated: 2439
PEX14_WT_Untreated: 2505
PURA_WT_Untreated: 2712
G3BP1_WT_Untreated: 2842
Phalloidin_WT_Untreated: 2219
NEMO_WT_Untreated: 2935
SQSTM1_WT_Untreated: 2651
PML_WT_Untreated: 2297
GM130_WT_Untreated: 2371
Calreticulin_WT_Untreated: 3056
KIF5A_WT_Untreated: 2622
FMRP_WT_Untreated: 2913
NCL_WT_Untreated: 2709
TOMM20_WT_Untreated: 2363
FUS_WT_Untreated: 2728
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
ANAX11_WT_Untreated: 2614
precision recall f1-score support
0 0.85 0.92 0.88 1850
1 0.97 0.42 0.58 2044
2 0.91 0.96 0.93 2332
3 0.99 1.00 1.00 22599
4 0.68 0.72 0.70 1901
5 0.61 0.54 0.57 1492
6 0.98 0.88 0.93 2095
7 0.94 0.94 0.94 2384
8 0.97 0.97 0.97 2145
9 0.97 0.82 0.89 2358
10 0.95 0.95 0.95 2340
11 0.94 0.95 0.95 2095
12 0.95 0.99 0.97 2085
13 0.85 0.99 0.91 2117
14 0.95 0.90 0.92 1751
15 0.96 0.88 0.92 1855
16 0.88 0.52 0.65 1623
17 0.57 0.68 0.62 1903
18 0.80 0.99 0.89 2085
19 0.99 0.84 0.91 2152
20 0.68 0.97 0.80 1857
21 0.68 0.29 0.41 1484
22 0.91 0.93 0.92 1836
23 0.85 0.91 0.88 2078
24 0.97 0.97 0.97 2200
25 0.92 0.98 0.95 16469
accuracy 0.91 87130
macro avg 0.87 0.84 0.85 87130
weighted avg 0.91 0.91 0.91 87130
=== Overall Accuracy ===
0.9330128509768199 [0.9547092782189902, 0.9113164237346494]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.995535 0.931034 0.996981 0.873642 0.998452
CLTC_WT_Untreated 0.991959 0.694105 0.999683 0.982689 0.992127
Calreticulin_WT_Untreated 0.996330 0.938336 0.997777 0.913283 0.998460
DAPI_WT_Untreated 0.999073 0.999979 0.998752 0.996491 0.999993
DCP1A_WT_Untreated 0.989696 0.768483 0.994971 0.784660 0.994482
FMRP_WT_Untreated 0.989149 0.751707 0.994647 0.764764 0.994254
FUS_WT_Untreated 0.997318 0.904179 0.999599 0.982192 0.997658
G3BP1_WT_Untreated 0.997699 0.956765 0.998776 0.953688 0.998861
GM130_WT_Untreated 0.998758 0.974618 0.999339 0.972561 0.999390
KIF5A_WT_Untreated 0.994840 0.834827 0.998880 0.949528 0.995842
LAMP1_WT_Untreated 0.996777 0.939996 0.998250 0.933073 0.998443
MitoTracker_WT_Untreated 0.997710 0.969060 0.998411 0.937249 0.999242
NCL_WT_Untreated 0.998935 0.989100 0.999175 0.966901 0.999734
NEMO_WT_Untreated 0.996402 0.965602 0.997182 0.896702 0.999127
P54_WT_Untreated 0.997158 0.904281 0.999076 0.952874 0.998025
PEX14_WT_Untreated 0.997235 0.919827 0.998951 0.951053 0.998224
PML_WT_Untreated 0.993140 0.697472 0.998864 0.922367 0.994171
PSD95_WT_Untreated 0.988835 0.769949 0.993189 0.692190 0.995413
PURA_WT_Untreated 0.995943 0.985150 0.996198 0.859382 0.999649
Phalloidin_WT_Untreated 0.995993 0.835291 0.999780 0.988930 0.996133
SNCA_WT_Untreated 0.992649 0.960317 0.993337 0.754364 0.999150
SQSTM1_WT_Untreated 0.989144 0.555449 0.996787 0.752916 0.992201
TDP43_WT_Untreated 0.996120 0.932626 0.997469 0.886759 0.998567
TIA1_WT_Untreated 0.995414 0.920029 0.997187 0.884962 0.998117
TOMM20_WT_Untreated 0.998333 0.962911 0.999197 0.966946 0.999095
TUJ1_WT_Untreated 0.987543 0.988229 0.987379 0.949359 0.997154
Macro Average 0.994911 0.886512 0.997301 0.902674 0.997383
{'Accuracy': 0.9949109663220342,
'Sensitivity': 0.8865124718026224,
'Specificity': 0.9973013872850368,
'PPV': 0.9026740505806359,
'NPV': 0.9973831966631587}
Cytoself_dataset_config = {
"path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/",
"multiplexed": False,
"config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
"config_dir": "manuscript/manuscript_figures_data_config",
}
## Baseline
run_baseline_model(
dataset_config= Cytoself_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:32:51 INFO: [load_embeddings] multiplex=False 2025-08-20 16:32:51 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:32:51 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:32:51 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:32:58 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:33:01 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:33:02 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:33:03 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:33:03 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:33:03 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:33:03 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:33:03 INFO: [load_embeddings] multiplex=False 2025-08-20 16:33:03 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:33:03 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:33:03 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:33:09 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:33:11 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:33:13 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:33:13 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:33:13 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:33:13 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:33:13 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:33:13 INFO: [load_embeddings] multiplex=False 2025-08-20 16:33:13 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:33:13 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:33:13 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:33:19 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:33:21 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:33:23 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:33:23 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:33:23 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:33:23 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:33:23 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
[W] [16:33:37.459943] L-BFGS: max iterations reached
[W] [16:33:37.464982] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
precision recall f1-score support
0 0.94 0.96 0.95 2123
1 0.98 0.97 0.97 2536
2 0.98 0.96 0.97 2079
3 1.00 1.00 1.00 24823
4 0.87 0.78 0.82 2319
5 0.81 0.85 0.83 2608
6 0.96 0.94 0.95 2236
7 0.97 0.98 0.97 2265
8 0.94 0.95 0.94 2110
9 0.86 0.82 0.84 2104
10 0.95 0.97 0.96 2243
11 0.98 0.99 0.99 2236
12 0.99 0.99 0.99 2227
13 0.96 0.88 0.92 2360
14 0.94 0.94 0.94 1916
15 0.98 0.97 0.97 2074
16 0.85 0.83 0.84 1818
17 0.81 0.84 0.82 1631
18 0.96 0.99 0.97 2090
19 0.98 0.92 0.95 2019
20 0.85 0.95 0.90 1923
21 0.67 0.74 0.70 1654
22 0.92 0.92 0.92 1934
23 0.97 0.96 0.97 2114
24 0.99 1.00 1.00 18531
accuracy 0.96 91973
macro avg 0.92 0.92 0.92 91973
weighted avg 0.96 0.96 0.96 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
[W] [16:33:51.947469] L-BFGS: max iterations reached
[W] [16:33:51.948901] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
precision recall f1-score support
0 0.81 0.95 0.87 1850
1 0.97 0.64 0.77 2044
2 0.97 0.95 0.96 2332
3 1.00 1.00 1.00 22599
4 0.73 0.65 0.69 1901
5 0.54 0.64 0.58 1492
6 0.94 0.89 0.92 2095
7 0.89 0.95 0.92 2384
8 0.92 0.94 0.93 2145
9 0.91 0.70 0.79 2358
10 0.88 0.95 0.92 2340
11 0.97 0.94 0.95 2095
12 0.95 0.99 0.97 2085
13 0.88 0.97 0.92 2117
14 0.93 0.92 0.92 1751
15 0.95 0.75 0.84 1855
16 0.74 0.49 0.59 1623
17 0.52 0.61 0.56 1903
18 0.91 0.98 0.94 2085
19 0.97 0.91 0.94 2152
20 0.65 0.94 0.77 1857
21 0.46 0.30 0.37 1484
22 0.91 0.92 0.91 1836
23 0.95 0.97 0.96 2200
24 0.96 0.99 0.98 16469
accuracy 0.91 85052
macro avg 0.85 0.84 0.84 85052
weighted avg 0.91 0.91 0.91 85052
=== Overall Accuracy ===
0.9338983849888869 [0.9565307209724593, 0.9112660490053144]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.995899 0.955701 0.996822 0.873476 0.998981
CLTC_WT_Untreated 0.994826 0.820524 0.999455 0.975597 0.995253
Calreticulin_WT_Untreated 0.998260 0.955112 0.999363 0.974555 0.998854
DAPI_WT_Untreated 0.999486 0.999831 0.999360 0.998253 0.999938
DCP1A_WT_Untreated 0.989295 0.722275 0.995816 0.808274 0.993235
FMRP_WT_Untreated 0.987137 0.773902 0.992193 0.701526 0.994626
FUS_WT_Untreated 0.996876 0.918033 0.998853 0.952563 0.997946
G3BP1_WT_Untreated 0.997080 0.965369 0.997935 0.926507 0.999065
GM130_WT_Untreated 0.996983 0.943596 0.998298 0.931771 0.998610
KIF5A_WT_Untreated 0.991408 0.759077 0.997415 0.883642 0.993793
LAMP1_WT_Untreated 0.996667 0.960506 0.997628 0.914986 0.998949
MitoTracker_WT_Untreated 0.998514 0.963519 0.999392 0.975456 0.999085
NCL_WT_Untreated 0.998978 0.989796 0.999207 0.968899 0.999745
NEMO_WT_Untreated 0.995961 0.919366 0.997948 0.920805 0.997908
P54_WT_Untreated 0.997096 0.927734 0.998564 0.931800 0.998472
PEX14_WT_Untreated 0.996419 0.868160 0.999330 0.967111 0.997014
PML_WT_Untreated 0.990527 0.670445 0.996872 0.809474 0.993489
PSD95_WT_Untreated 0.986471 0.714771 0.992005 0.645541 0.994177
PURA_WT_Untreated 0.998012 0.985389 0.998316 0.933939 0.999647
Phalloidin_WT_Untreated 0.997441 0.913450 0.999468 0.976422 0.997915
SNCA_WT_Untreated 0.991719 0.942593 0.992791 0.740441 0.998740
SQSTM1_WT_Untreated 0.985234 0.533142 0.993392 0.592842 0.991590
TDP43_WT_Untreated 0.996424 0.918037 0.998130 0.914399 0.998216
TOMM20_WT_Untreated 0.998192 0.964070 0.999045 0.961841 0.999102
TUJ1_WT_Untreated 0.994662 0.995029 0.994571 0.978341 0.998770
Macro Average 0.994783 0.883177 0.997287 0.890338 0.997325
{'Accuracy': 0.9947826578166924,
'Sensitivity': 0.8831770315317321,
'Specificity': 0.9972867688807299,
'PPV': 0.8903383300699761,
'NPV': 0.9973248723561033}
## Baseline
run_baseline_model(
dataset_config= Cytoself_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
2025-08-20 11:53:48 INFO: [load_embeddings] multiplex=False 2025-08-20 11:53:48 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 11:53:48 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 11:53:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 11:53:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 11:53:58 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 11:54:00 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 11:54:01 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 11:54:01 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 11:54:01 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 11:54:01 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 11:54:01 INFO: [load_embeddings] multiplex=False 2025-08-20 11:54:01 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 11:54:01 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 11:54:01 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 11:54:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 11:54:10 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 11:54:11 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 11:54:12 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 11:54:12 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 11:54:12 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 11:54:12 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 11:54:12 INFO: [load_embeddings] multiplex=False 2025-08-20 11:54:12 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 11:54:12 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 11:54:12 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 11:54:18 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 11:54:21 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 11:54:22 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 11:54:23 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 11:54:23 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 11:54:23 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 11:54:23 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [2, 3], Testing on: [1].
=== Batch [1] ===
Train: (177025, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 3973
CLTC_WT_Untreated: 4580
Calreticulin_WT_Untreated: 4411
DAPI_WT_Untreated: 47422
DCP1A_WT_Untreated: 4220
FMRP_WT_Untreated: 4100
FUS_WT_Untreated: 4331
G3BP1_WT_Untreated: 4649
GM130_WT_Untreated: 4255
KIF5A_WT_Untreated: 4462
LAMP1_WT_Untreated: 4583
MitoTracker_WT_Untreated: 4331
NCL_WT_Untreated: 4312
NEMO_WT_Untreated: 4477
P54_WT_Untreated: 3667
PEX14_WT_Untreated: 3929
PML_WT_Untreated: 3441
PSD95_WT_Untreated: 3534
PURA_WT_Untreated: 4175
Phalloidin_WT_Untreated: 4171
SNCA_WT_Untreated: 3780
SQSTM1_WT_Untreated: 3138
TDP43_WT_Untreated: 3770
TOMM20_WT_Untreated: 4314
TUJ1_WT_Untreated: 35000
[W] [11:54:49.349903] L-BFGS: max iterations reached
[W] [11:54:49.350944] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
precision recall f1-score support
0 0.95 0.94 0.94 2614
1 0.94 0.98 0.96 2439
2 0.99 0.98 0.98 3056
3 1.00 1.00 1.00 30429
4 0.75 0.85 0.80 2364
5 0.83 0.76 0.79 2913
6 0.97 0.97 0.97 2728
7 0.98 0.98 0.98 2842
8 0.93 0.96 0.95 2371
9 0.76 0.85 0.80 2622
10 0.97 0.96 0.97 3067
11 0.99 0.99 0.99 2728
12 1.00 0.99 0.99 2709
13 0.94 0.96 0.95 2935
14 0.94 0.96 0.95 2623
15 0.96 0.98 0.97 2505
16 0.80 0.85 0.83 2297
17 0.73 0.68 0.71 2101
18 0.99 0.97 0.98 2712
19 0.92 0.97 0.95 2219
20 0.93 0.82 0.87 2454
21 0.68 0.61 0.65 2651
22 0.93 0.92 0.93 2535
23 0.97 0.99 0.98 2363
24 1.00 0.99 1.00 22601
accuracy 0.95 112878
macro avg 0.91 0.92 0.91 112878
weighted avg 0.95 0.95 0.95 112878
Training on Batches: [1, 3], Testing on: [2].
=== Batch [2] ===
Train: (197930, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 4464
CLTC_WT_Untreated: 4483
Calreticulin_WT_Untreated: 5388
DAPI_WT_Untreated: 53028
DCP1A_WT_Untreated: 4265
FMRP_WT_Untreated: 4405
FUS_WT_Untreated: 4823
G3BP1_WT_Untreated: 5226
GM130_WT_Untreated: 4516
KIF5A_WT_Untreated: 4980
LAMP1_WT_Untreated: 5407
MitoTracker_WT_Untreated: 4823
NCL_WT_Untreated: 4794
NEMO_WT_Untreated: 5052
P54_WT_Untreated: 4374
PEX14_WT_Untreated: 4360
PML_WT_Untreated: 3920
PSD95_WT_Untreated: 4004
PURA_WT_Untreated: 4797
Phalloidin_WT_Untreated: 4371
SNCA_WT_Untreated: 4311
SQSTM1_WT_Untreated: 4135
TDP43_WT_Untreated: 4371
TOMM20_WT_Untreated: 4563
TUJ1_WT_Untreated: 39070
[W] [11:55:11.191966] L-BFGS: max iterations reached
[W] [11:55:11.195505] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
precision recall f1-score support
0 0.95 0.95 0.95 2123
1 0.98 0.97 0.97 2536
2 0.98 0.97 0.98 2079
3 1.00 1.00 1.00 24823
4 0.80 0.79 0.80 2319
5 0.79 0.77 0.78 2608
6 0.96 0.94 0.95 2236
7 0.98 0.98 0.98 2265
8 0.94 0.94 0.94 2110
9 0.84 0.86 0.85 2104
10 0.96 0.97 0.96 2243
11 0.98 0.99 0.99 2236
12 0.99 0.98 0.99 2227
13 0.97 0.82 0.88 2360
14 0.94 0.95 0.94 1916
15 0.95 0.97 0.96 2074
16 0.79 0.82 0.80 1818
17 0.71 0.71 0.71 1631
18 0.98 0.98 0.98 2090
19 0.97 0.95 0.96 2019
20 0.86 0.88 0.87 1923
21 0.60 0.72 0.65 1654
22 0.92 0.92 0.92 1934
23 0.98 0.97 0.97 2114
24 0.99 1.00 1.00 18531
accuracy 0.95 91973
macro avg 0.91 0.91 0.91 91973
weighted avg 0.95 0.95 0.95 91973
Training on Batches: [1, 2], Testing on: [3].
=== Batch [3] ===
Train: (204851, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 4737
CLTC_WT_Untreated: 4975
Calreticulin_WT_Untreated: 5135
DAPI_WT_Untreated: 55252
DCP1A_WT_Untreated: 4683
FMRP_WT_Untreated: 5521
FUS_WT_Untreated: 4964
G3BP1_WT_Untreated: 5107
GM130_WT_Untreated: 4481
KIF5A_WT_Untreated: 4726
LAMP1_WT_Untreated: 5310
MitoTracker_WT_Untreated: 4964
NCL_WT_Untreated: 4936
NEMO_WT_Untreated: 5295
P54_WT_Untreated: 4539
PEX14_WT_Untreated: 4579
PML_WT_Untreated: 4115
PSD95_WT_Untreated: 3732
PURA_WT_Untreated: 4802
Phalloidin_WT_Untreated: 4238
SNCA_WT_Untreated: 4377
SQSTM1_WT_Untreated: 4305
TDP43_WT_Untreated: 4469
TOMM20_WT_Untreated: 4477
TUJ1_WT_Untreated: 41132
[W] [11:55:34.194890] L-BFGS: max iterations reached
[W] [11:55:34.200548] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
precision recall f1-score support
0 0.85 0.95 0.90 1850
1 0.97 0.70 0.82 2044
2 0.97 0.98 0.98 2332
3 1.00 1.00 1.00 22599
4 0.71 0.67 0.69 1901
5 0.54 0.58 0.56 1492
6 0.93 0.93 0.93 2095
7 0.92 0.94 0.93 2384
8 0.93 0.94 0.94 2145
9 0.93 0.78 0.85 2358
10 0.89 0.96 0.92 2340
11 0.96 0.95 0.96 2095
12 0.96 0.99 0.98 2085
13 0.82 0.97 0.89 2117
14 0.93 0.91 0.92 1751
15 0.96 0.76 0.85 1855
16 0.71 0.52 0.60 1623
17 0.52 0.60 0.56 1903
18 0.94 0.99 0.96 2085
19 0.97 0.96 0.96 2152
20 0.70 0.92 0.80 1857
21 0.51 0.30 0.38 1484
22 0.90 0.92 0.91 1836
23 0.96 0.97 0.97 2200
24 0.97 0.99 0.98 16469
accuracy 0.92 85052
macro avg 0.86 0.85 0.85 85052
weighted avg 0.92 0.92 0.92 85052
=== Overall Accuracy ===
0.939875249571192 [0.9517886567798862, 0.9498222304371935, 0.9180148614964962]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.996899 0.943525 0.998140 0.921833 0.998686
CLTC_WT_Untreated 0.996602 0.893860 0.999152 0.963156 0.997371
Calreticulin_WT_Untreated 0.998951 0.978706 0.999487 0.980545 0.999437
DAPI_WT_Untreated 0.999727 0.999576 0.999783 0.999409 0.999844
DCP1A_WT_Untreated 0.989300 0.780377 0.994155 0.756256 0.994892
FMRP_WT_Untreated 0.987385 0.724654 0.993899 0.746475 0.993179
FUS_WT_Untreated 0.997637 0.948576 0.998862 0.954118 0.998717
G3BP1_WT_Untreated 0.998086 0.965826 0.998941 0.960313 0.999093
GM130_WT_Untreated 0.997282 0.948989 0.998411 0.933215 0.998806
KIF5A_WT_Untreated 0.991690 0.825381 0.995856 0.833025 0.995627
LAMP1_WT_Untreated 0.997447 0.962222 0.998402 0.942268 0.998976
MitoTracker_WT_Untreated 0.998976 0.977334 0.999516 0.980529 0.999434
NCL_WT_Untreated 0.999383 0.989318 0.999632 0.985248 0.999735
NEMO_WT_Untreated 0.995457 0.916622 0.997526 0.906713 0.997812
P54_WT_Untreated 0.997289 0.939905 0.998561 0.935443 0.998667
PEX14_WT_Untreated 0.997192 0.911719 0.999132 0.959751 0.997999
PML_WT_Untreated 0.990721 0.750087 0.995580 0.774101 0.994957
PSD95_WT_Untreated 0.986385 0.664241 0.992771 0.645567 0.993340
PURA_WT_Untreated 0.998775 0.978220 0.999276 0.970470 0.999470
Phalloidin_WT_Untreated 0.998041 0.959937 0.998900 0.951598 0.999097
SNCA_WT_Untreated 0.993156 0.869586 0.995872 0.822360 0.997130
SQSTM1_WT_Untreated 0.984364 0.565901 0.992890 0.618580 0.991170
TDP43_WT_Untreated 0.996547 0.921015 0.998226 0.920285 0.998244
TOMM20_WT_Untreated 0.998762 0.974989 0.999322 0.971352 0.999410
TUJ1_WT_Untreated 0.996457 0.994115 0.997038 0.988128 0.998538
Macro Average 0.995300 0.895387 0.997573 0.896830 0.997585
run_train_test_split_baseline(
Cytoself_dataset_config,
batches=[1],
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
)
2025-08-20 11:55:37 INFO: [load_embeddings] multiplex=False 2025-08-20 11:55:37 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 11:55:37 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 11:55:37 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 11:55:45 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 11:55:48 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 11:55:49 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 11:55:50 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 11:55:50 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 11:55:50 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 11:55:50 INFO: [load_embeddings] paths shape: (112878,)
Train dataset
(90302,) (90302, 2048) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
2: 2445
17: 1681
3: 24343
8: 1897
24: 18081
5: 2330
14: 2098
16: 1838
4: 1891
12: 2167
15: 2004
9: 2098
7: 2274
6: 2182
0: 2091
18: 2170
21: 2121
13: 2348
11: 2182
10: 2454
20: 1963
22: 2028
1: 1951
19: 1775
23: 1890
Test dataset
(22576,) (22576, 2048) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
16: 459
15: 501
21: 530
24: 4520
3: 6086
12: 542
20: 491
19: 444
5: 583
13: 587
7: 568
6: 546
1: 488
8: 474
4: 473
0: 523
18: 542
22: 507
10: 613
2: 611
17: 420
11: 546
23: 473
9: 524
14: 525
precision recall f1-score support
0 0.95 0.97 0.96 523
1 0.98 0.96 0.97 488
2 1.00 0.99 0.99 611
3 1.00 1.00 1.00 6086
4 0.87 0.89 0.88 473
5 0.85 0.84 0.84 583
6 0.97 0.97 0.97 546
7 0.98 0.98 0.98 568
8 0.96 0.97 0.96 474
9 0.80 0.84 0.82 524
10 0.97 0.99 0.98 613
11 1.00 0.99 1.00 546
12 0.99 1.00 0.99 542
13 0.96 0.95 0.96 587
14 0.97 0.96 0.96 525
15 0.97 0.98 0.98 501
16 0.88 0.85 0.86 459
17 0.84 0.85 0.84 420
18 0.98 0.98 0.98 542
19 0.98 0.98 0.98 444
20 0.93 0.91 0.92 491
21 0.75 0.75 0.75 530
22 0.94 0.94 0.94 507
23 0.99 0.97 0.98 473
24 1.00 1.00 1.00 4520
accuracy 0.97 22576
macro avg 0.94 0.94 0.94 22576
weighted avg 0.97 0.97 0.97 22576
Accuracy: 0.9659
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.998184 0.967495 0.998912 0.954717 0.999229
CLTC_WT_Untreated 0.998848 0.963115 0.999638 0.983264 0.999185
Calreticulin_WT_Untreated 0.999601 0.990180 0.999863 0.995066 0.999727
DAPI_WT_Untreated 0.999779 0.999343 0.999939 0.999836 0.999757
DCP1A_WT_Untreated 0.994995 0.890063 0.997240 0.873444 0.997646
FMRP_WT_Untreated 0.991938 0.842196 0.995908 0.845095 0.995817
FUS_WT_Untreated 0.998405 0.967033 0.999183 0.967033 0.999183
G3BP1_WT_Untreated 0.999114 0.980634 0.999591 0.984099 0.999500
GM130_WT_Untreated 0.998450 0.968354 0.999095 0.958246 0.999321
KIF5A_WT_Untreated 0.991495 0.837786 0.995148 0.804029 0.996142
LAMP1_WT_Untreated 0.998804 0.985318 0.999180 0.971061 0.999590
MitoTracker_WT_Untreated 0.999779 0.994505 0.999909 0.996330 0.999864
NCL_WT_Untreated 0.999734 0.996310 0.999818 0.992647 0.999909
NEMO_WT_Untreated 0.997741 0.952300 0.998954 0.960481 0.998727
P54_WT_Untreated 0.998361 0.960000 0.999274 0.969231 0.999048
PEX14_WT_Untreated 0.999070 0.984032 0.999411 0.974308 0.999638
PML_WT_Untreated 0.994463 0.845316 0.997558 0.877828 0.996792
PSD95_WT_Untreated 0.994153 0.847619 0.996931 0.839623 0.997111
PURA_WT_Untreated 0.999026 0.981550 0.999455 0.977941 0.999546
Phalloidin_WT_Untreated 0.999114 0.979730 0.999503 0.975336 0.999593
SNCA_WT_Untreated 0.996545 0.912424 0.998415 0.927536 0.998054
SQSTM1_WT_Untreated 0.988306 0.747170 0.994103 0.752852 0.993923
TDP43_WT_Untreated 0.997431 0.942801 0.998686 0.942801 0.998686
TOMM20_WT_Untreated 0.999158 0.972516 0.999729 0.987124 0.999412
TUJ1_WT_Untreated 0.999291 0.998230 0.999557 0.998230 0.999557
Macro Average 0.997271 0.940241 0.998600 0.940326 0.998598
## Baseline
run_baseline_model(
dataset_config= Cytoself_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=GaussianNB,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:33:54 INFO: [load_embeddings] multiplex=False 2025-08-20 16:33:54 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:33:54 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:33:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:34:01 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:34:04 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:34:05 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:34:06 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:34:06 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:34:06 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:34:06 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:34:06 INFO: [load_embeddings] multiplex=False 2025-08-20 16:34:06 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:34:06 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:34:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:34:12 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:34:14 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:34:16 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:34:16 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:34:16 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:34:16 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:34:16 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:34:16 INFO: [load_embeddings] multiplex=False 2025-08-20 16:34:16 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:34:16 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:34:16 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:34:22 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:34:24 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:34:25 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:34:26 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:34:26 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:34:26 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:34:26 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
precision recall f1-score support
0 0.79 0.77 0.78 2123
1 0.87 0.81 0.84 2536
2 0.92 0.87 0.89 2079
3 1.00 0.99 0.99 24823
4 0.56 0.60 0.58 2319
5 0.60 0.42 0.49 2608
6 0.84 0.83 0.83 2236
7 0.79 0.92 0.85 2265
8 0.84 0.75 0.79 2110
9 0.63 0.65 0.64 2104
10 0.75 0.76 0.76 2243
11 0.76 0.94 0.84 2236
12 0.90 0.91 0.90 2227
13 0.83 0.66 0.73 2360
14 0.87 0.77 0.82 1916
15 0.78 0.91 0.84 2074
16 0.61 0.62 0.61 1818
17 0.42 0.80 0.55 1631
18 0.83 0.92 0.87 2090
19 0.84 0.79 0.81 2019
20 0.55 0.71 0.62 1923
21 0.40 0.29 0.34 1654
22 0.72 0.81 0.76 1934
23 0.87 0.78 0.82 2114
24 0.99 0.95 0.97 18531
accuracy 0.86 91973
macro avg 0.76 0.77 0.76 91973
weighted avg 0.86 0.86 0.86 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
precision recall f1-score support
0 0.74 0.77 0.75 1850
1 0.81 0.46 0.59 2044
2 0.90 0.89 0.89 2332
3 1.00 0.99 0.99 22599
4 0.48 0.48 0.48 1901
5 0.23 0.17 0.20 1492
6 0.78 0.83 0.80 2095
7 0.66 0.91 0.76 2384
8 0.83 0.71 0.76 2145
9 0.79 0.60 0.68 2358
10 0.70 0.75 0.73 2340
11 0.73 0.92 0.82 2095
12 0.83 0.90 0.86 2085
13 0.74 0.82 0.78 2117
14 0.86 0.76 0.81 1751
15 0.72 0.78 0.75 1855
16 0.44 0.37 0.40 1623
17 0.35 0.55 0.43 1903
18 0.76 0.96 0.85 2085
19 0.89 0.76 0.82 2152
20 0.50 0.74 0.60 1857
21 0.31 0.14 0.19 1484
22 0.74 0.83 0.78 1836
23 0.80 0.76 0.78 2200
24 0.98 0.93 0.95 16469
accuracy 0.82 85052
macro avg 0.70 0.71 0.70 85052
weighted avg 0.83 0.82 0.82 85052
=== Overall Accuracy ===
0.8395543923084285 [0.8551422700140259, 0.8239665146028312]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.989391 0.767430 0.994487 0.761679 0.994660
CLTC_WT_Untreated 0.987990 0.651310 0.996932 0.849374 0.990796
Calreticulin_WT_Untreated 0.994752 0.878259 0.997729 0.908111 0.996892
DAPI_WT_Untreated 0.995758 0.986040 0.999313 0.998100 0.994915
DCP1A_WT_Untreated 0.977370 0.541706 0.988010 0.524553 0.988799
FMRP_WT_Untreated 0.975371 0.328049 0.990719 0.455932 0.984173
FUS_WT_Untreated 0.990911 0.827984 0.994997 0.805843 0.995683
G3BP1_WT_Untreated 0.988307 0.912669 0.990347 0.718300 0.997627
GM130_WT_Untreated 0.990018 0.729495 0.996435 0.834409 0.993359
KIF5A_WT_Untreated 0.983901 0.624160 0.993202 0.703638 0.990310
LAMP1_WT_Untreated 0.986324 0.756491 0.992432 0.726530 0.993521
MitoTracker_WT_Untreated 0.990611 0.930732 0.992113 0.747450 0.998252
NCL_WT_Untreated 0.994238 0.907236 0.996410 0.863195 0.997681
NEMO_WT_Untreated 0.988036 0.731740 0.994686 0.781302 0.993051
P54_WT_Untreated 0.992713 0.767112 0.997485 0.865805 0.995086
PEX14_WT_Untreated 0.990578 0.849071 0.993790 0.756291 0.996565
PML_WT_Untreated 0.981963 0.497820 0.991560 0.539018 0.990060
PSD95_WT_Untreated 0.972043 0.663271 0.978333 0.384073 0.993038
PURA_WT_Untreated 0.992634 0.938443 0.993943 0.789124 0.998506
Phalloidin_WT_Untreated 0.991849 0.774155 0.997102 0.865684 0.994564
SNCA_WT_Untreated 0.980076 0.725926 0.985622 0.524164 0.993969
SQSTM1_WT_Untreated 0.979568 0.218611 0.993300 0.370610 0.986002
TDP43_WT_Untreated 0.989696 0.817241 0.993449 0.730787 0.996013
TOMM20_WT_Untreated 0.990634 0.771210 0.996115 0.832166 0.994296
TUJ1_WT_Untreated 0.985595 0.938229 0.997268 0.988322 0.984965
Macro Average 0.987213 0.741376 0.993431 0.732978 0.993311
{'Accuracy': 0.9872131054935742,
'Sensitivity': 0.7413756087993754,
'Specificity': 0.993431117325448,
'PPV': 0.7329784414077817,
'NPV': 0.9933113358320611}
## Baseline
run_baseline_model(
dataset_config= Cytoself_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=RidgeClassifier,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:37:06 INFO: [load_embeddings] multiplex=False 2025-08-20 16:37:06 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:37:06 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:37:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:37:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:37:27 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:37:31 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:37:32 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:37:32 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:37:32 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:37:32 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:37:33 INFO: [load_embeddings] multiplex=False 2025-08-20 16:37:33 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:37:33 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:37:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:37:46 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:37:50 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:37:53 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:37:54 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:37:54 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:37:54 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:37:54 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:37:55 INFO: [load_embeddings] multiplex=False 2025-08-20 16:37:55 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:37:55 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:37:55 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:38:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:38:11 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:38:14 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:38:15 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:38:15 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:38:15 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:38:15 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] ANAX11_WT_Untreated: 2614 CLTC_WT_Untreated: 2439 Calreticulin_WT_Untreated: 3056 DAPI_WT_Untreated: 30429 DCP1A_WT_Untreated: 2364 FMRP_WT_Untreated: 2913 FUS_WT_Untreated: 2728 G3BP1_WT_Untreated: 2842 GM130_WT_Untreated: 2371 KIF5A_WT_Untreated: 2622 LAMP1_WT_Untreated: 3067 MitoTracker_WT_Untreated: 2728 NCL_WT_Untreated: 2709 NEMO_WT_Untreated: 2935 P54_WT_Untreated: 2623 PEX14_WT_Untreated: 2505 PML_WT_Untreated: 2297 PSD95_WT_Untreated: 2101 PURA_WT_Untreated: 2712 Phalloidin_WT_Untreated: 2219 SNCA_WT_Untreated: 2454 SQSTM1_WT_Untreated: 2651 TDP43_WT_Untreated: 2535 TOMM20_WT_Untreated: 2363 TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py:211: LinAlgWarning: Ill-conditioned matrix (rcond=3.35676e-10): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
precision recall f1-score support
0 0.84 0.93 0.88 2123
1 0.95 0.92 0.93 2536
2 0.89 0.92 0.91 2079
3 0.99 1.00 1.00 24823
4 0.84 0.71 0.77 2319
5 0.76 0.72 0.74 2608
6 0.87 0.91 0.89 2236
7 0.91 0.96 0.94 2265
8 0.86 0.92 0.89 2110
9 0.77 0.75 0.76 2104
10 0.84 0.93 0.88 2243
11 0.98 0.93 0.95 2236
12 0.98 0.92 0.95 2227
13 0.83 0.91 0.87 2360
14 0.89 0.89 0.89 1916
15 0.89 0.96 0.92 2074
16 0.88 0.55 0.67 1818
17 0.82 0.67 0.74 1631
18 0.79 0.97 0.87 2090
19 0.96 0.76 0.85 2019
20 0.79 0.83 0.81 1923
21 0.66 0.56 0.60 1654
22 0.83 0.84 0.84 1934
23 0.93 0.91 0.92 2114
24 0.97 1.00 0.98 18531
accuracy 0.92 91973
macro avg 0.87 0.85 0.86 91973
weighted avg 0.92 0.92 0.92 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/linear_model/_ridge.py:211: LinAlgWarning: Ill-conditioned matrix (rcond=3.35676e-10): result may not be accurate. return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
precision recall f1-score support
0 0.71 0.90 0.79 1850
1 0.89 0.43 0.58 2044
2 0.91 0.95 0.93 2332
3 0.99 1.00 0.99 22599
4 0.71 0.50 0.59 1901
5 0.38 0.36 0.37 1492
6 0.86 0.82 0.84 2095
7 0.81 0.93 0.87 2384
8 0.89 0.85 0.87 2145
9 0.78 0.56 0.65 2358
10 0.74 0.94 0.83 2340
11 0.96 0.84 0.89 2095
12 0.97 0.96 0.96 2085
13 0.63 0.99 0.77 2117
14 0.87 0.83 0.85 1751
15 0.83 0.69 0.75 1855
16 0.78 0.35 0.49 1623
17 0.51 0.39 0.44 1903
18 0.67 0.98 0.79 2085
19 0.98 0.72 0.83 2152
20 0.57 0.79 0.66 1857
21 0.46 0.23 0.30 1484
22 0.79 0.86 0.82 1836
23 0.91 0.91 0.91 2200
24 0.93 0.98 0.96 16469
accuracy 0.86 85052
macro avg 0.78 0.75 0.75 85052
weighted avg 0.86 0.86 0.85 85052
=== Overall Accuracy ===
0.8897641608992843 [0.9188783664771183, 0.8606499553214504]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.992035 0.913416 0.993840 0.772950 0.998004
CLTC_WT_Untreated 0.990905 0.697380 0.998701 0.934465 0.992016
Calreticulin_WT_Untreated 0.995893 0.937202 0.997393 0.901832 0.998394
DAPI_WT_Untreated 0.997430 0.999873 0.996536 0.990619 0.999954
DCP1A_WT_Untreated 0.986748 0.614929 0.995828 0.782569 0.990645
FMRP_WT_Untreated 0.982296 0.585610 0.991702 0.625912 0.990190
FUS_WT_Untreated 0.993459 0.866774 0.996636 0.865975 0.996659
G3BP1_WT_Untreated 0.994515 0.946870 0.995800 0.858759 0.998563
GM130_WT_Untreated 0.994170 0.885546 0.996846 0.873638 0.997180
KIF5A_WT_Untreated 0.986318 0.650381 0.995005 0.770988 0.990996
LAMP1_WT_Untreated 0.991606 0.936723 0.993064 0.782110 0.998309
MitoTracker_WT_Untreated 0.996469 0.886631 0.999224 0.966281 0.997163
NCL_WT_Untreated 0.997995 0.941095 0.999415 0.975715 0.998531
NEMO_WT_Untreated 0.989165 0.947063 0.990258 0.716095 0.998615
P54_WT_Untreated 0.994713 0.859013 0.997583 0.882600 0.997019
PEX14_WT_Untreated 0.993363 0.831000 0.997048 0.864672 0.996167
PML_WT_Untreated 0.987708 0.454519 0.998277 0.839506 0.989284
PSD95_WT_Untreated 0.985070 0.516978 0.994605 0.661238 0.990204
PURA_WT_Untreated 0.990662 0.974371 0.991056 0.724617 0.999376
Phalloidin_WT_Untreated 0.993278 0.741309 0.999358 0.965345 0.993793
SNCA_WT_Untreated 0.987301 0.814550 0.991070 0.665586 0.995934
SQSTM1_WT_Untreated 0.984432 0.400255 0.994974 0.589671 0.989239
TDP43_WT_Untreated 0.992560 0.850133 0.995660 0.809957 0.996735
TOMM20_WT_Untreated 0.995865 0.911219 0.997979 0.918458 0.997783
TUJ1_WT_Untreated 0.987849 0.989971 0.987326 0.950616 0.997503
Macro Average 0.991272 0.806112 0.995407 0.827607 0.995530
{'Accuracy': 0.9912721931930517,
'Sensitivity': 0.8061124950415774,
'Specificity': 0.9954072922750745,
'PPV': 0.827607007333263,
'NPV': 0.99553027214942}
## Baseline
run_baseline_model(
dataset_config= Cytoself_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=LinearSVC,
classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:39:41 INFO: [load_embeddings] multiplex=False 2025-08-20 16:39:41 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:39:41 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:39:41 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:39:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:40:02 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:40:05 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:40:07 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:40:07 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:40:07 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:40:07 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:40:08 INFO: [load_embeddings] multiplex=False 2025-08-20 16:40:08 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:40:08 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:40:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:40:20 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:40:24 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:40:27 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:40:28 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:40:28 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:40:28 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:40:28 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:40:29 INFO: [load_embeddings] multiplex=False 2025-08-20 16:40:29 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:40:29 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:40:29 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:40:40 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:40:45 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:40:48 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:40:49 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:40:49 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:40:49 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:40:49 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] ANAX11_WT_Untreated: 2614 CLTC_WT_Untreated: 2439 Calreticulin_WT_Untreated: 3056 DAPI_WT_Untreated: 30429 DCP1A_WT_Untreated: 2364 FMRP_WT_Untreated: 2913 FUS_WT_Untreated: 2728 G3BP1_WT_Untreated: 2842 GM130_WT_Untreated: 2371 KIF5A_WT_Untreated: 2622 LAMP1_WT_Untreated: 3067 MitoTracker_WT_Untreated: 2728 NCL_WT_Untreated: 2709 NEMO_WT_Untreated: 2935 P54_WT_Untreated: 2623 PEX14_WT_Untreated: 2505 PML_WT_Untreated: 2297 PSD95_WT_Untreated: 2101 PURA_WT_Untreated: 2712 Phalloidin_WT_Untreated: 2219 SNCA_WT_Untreated: 2454 SQSTM1_WT_Untreated: 2651 TDP43_WT_Untreated: 2535 TOMM20_WT_Untreated: 2363 TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn( /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_base.py:1242: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn(
precision recall f1-score support
0 0.91 0.95 0.93 2123
1 0.97 0.94 0.96 2536
2 0.97 0.95 0.96 2079
3 1.00 1.00 1.00 24823
4 0.83 0.70 0.76 2319
5 0.61 0.89 0.73 2608
6 0.96 0.91 0.94 2236
7 0.96 0.97 0.97 2265
8 0.94 0.92 0.93 2110
9 0.80 0.77 0.78 2104
10 0.93 0.95 0.94 2243
11 0.98 0.98 0.98 2236
12 0.98 0.99 0.98 2227
13 0.95 0.83 0.89 2360
14 0.93 0.91 0.92 1916
15 0.97 0.95 0.96 2074
16 0.87 0.67 0.75 1818
17 0.76 0.76 0.76 1631
18 0.95 0.98 0.96 2090
19 0.96 0.89 0.93 2019
20 0.82 0.89 0.86 1923
21 0.61 0.59 0.60 1654
22 0.87 0.88 0.88 1934
23 0.97 0.95 0.96 2114
24 0.99 1.00 1.00 18531
accuracy 0.94 91973
macro avg 0.90 0.89 0.89 91973
weighted avg 0.94 0.94 0.94 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn( /home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_base.py:1242: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. warnings.warn(
precision recall f1-score support
0 0.77 0.94 0.84 1850
1 0.95 0.59 0.73 2044
2 0.93 0.96 0.95 2332
3 1.00 1.00 1.00 22599
4 0.70 0.55 0.62 1901
5 0.36 0.67 0.47 1492
6 0.93 0.86 0.89 2095
7 0.86 0.94 0.90 2384
8 0.94 0.90 0.92 2145
9 0.85 0.59 0.69 2358
10 0.89 0.94 0.92 2340
11 0.95 0.94 0.95 2095
12 0.94 0.99 0.96 2085
13 0.87 0.95 0.91 2117
14 0.91 0.88 0.89 1751
15 0.95 0.75 0.84 1855
16 0.78 0.40 0.53 1623
17 0.46 0.50 0.48 1903
18 0.86 0.99 0.92 2085
19 0.97 0.88 0.92 2152
20 0.66 0.90 0.76 1857
21 0.34 0.22 0.27 1484
22 0.87 0.90 0.88 1836
23 0.95 0.95 0.95 2200
24 0.97 0.99 0.98 16469
accuracy 0.89 85052
macro avg 0.83 0.81 0.81 85052
weighted avg 0.90 0.89 0.89 85052
=== Overall Accuracy ===
0.9159204645033483 [0.9387755102040817, 0.8930654188026149]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.994617 0.945885 0.995735 0.835854 0.998754
CLTC_WT_Untreated 0.993679 0.782969 0.999275 0.966316 0.994265
Calreticulin_WT_Untreated 0.997611 0.958513 0.998610 0.946285 0.998939
DAPI_WT_Untreated 0.999362 0.999852 0.999182 0.997769 0.999946
DCP1A_WT_Untreated 0.986798 0.635782 0.995371 0.770313 0.991143
FMRP_WT_Untreated 0.977286 0.812927 0.981183 0.505997 0.995500
FUS_WT_Untreated 0.996001 0.887555 0.998720 0.945633 0.997184
G3BP1_WT_Untreated 0.996311 0.953753 0.997459 0.910099 0.998751
GM130_WT_Untreated 0.996458 0.909048 0.998611 0.941577 0.997762
KIF5A_WT_Untreated 0.988120 0.671672 0.996303 0.824484 0.991551
LAMP1_WT_Untreated 0.996210 0.946323 0.997535 0.910752 0.998572
MitoTracker_WT_Untreated 0.998300 0.962133 0.999207 0.968169 0.999050
NCL_WT_Untreated 0.998650 0.987477 0.998929 0.958361 0.999687
NEMO_WT_Untreated 0.994967 0.888765 0.997722 0.910110 0.997116
P54_WT_Untreated 0.996215 0.898009 0.998293 0.917526 0.997844
PEX14_WT_Untreated 0.996051 0.856198 0.999226 0.961692 0.996744
PML_WT_Untreated 0.989013 0.543447 0.997845 0.833333 0.991012
PSD95_WT_Untreated 0.983884 0.621109 0.991273 0.591804 0.992274
PURA_WT_Untreated 0.997023 0.981557 0.997397 0.901055 0.999554
Phalloidin_WT_Untreated 0.996588 0.887797 0.999213 0.964574 0.997298
SNCA_WT_Untreated 0.990849 0.894974 0.992941 0.734477 0.997697
SQSTM1_WT_Untreated 0.982483 0.415551 0.992714 0.507196 0.989487
TDP43_WT_Untreated 0.994769 0.889390 0.997062 0.868203 0.997592
TOMM20_WT_Untreated 0.997740 0.949930 0.998935 0.957029 0.998750
TUJ1_WT_Untreated 0.994645 0.992886 0.995078 0.980282 0.998241
Macro Average 0.993345 0.850940 0.996553 0.864356 0.996589
{'Accuracy': 0.9933451207456574,
'Sensitivity': 0.8509400883652646,
'Specificity': 0.9965527205134995,
'PPV': 0.8643556633095661,
'NPV': 0.9965885035678497}
run_baseline_model(
Cytoself_dataset_config,
batches=[1,2,3,],
classifier_class=cuRF,
classifier_kwargs={"n_estimators": 300, "random_state": 42}, # max_depth=0 => unlimited in cuML
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:49:28 INFO: [load_embeddings] multiplex=False 2025-08-20 16:49:28 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:49:28 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:49:28 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:49:43 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:49:49 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:49:53 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:49:55 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:49:55 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:49:55 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:49:55 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:49:56 INFO: [load_embeddings] multiplex=False 2025-08-20 16:49:56 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:49:56 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:49:56 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:50:08 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:50:12 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:50:15 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:50:16 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:50:16 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:50:16 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:50:16 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:50:17 INFO: [load_embeddings] multiplex=False 2025-08-20 16:50:17 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:50:17 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:50:17 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:50:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:50:44 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:50:47 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:50:48 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:50:48 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:50:48 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:50:48 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] ANAX11_WT_Untreated: 2614 CLTC_WT_Untreated: 2439 Calreticulin_WT_Untreated: 3056 DAPI_WT_Untreated: 30429 DCP1A_WT_Untreated: 2364 FMRP_WT_Untreated: 2913 FUS_WT_Untreated: 2728 G3BP1_WT_Untreated: 2842 GM130_WT_Untreated: 2371 KIF5A_WT_Untreated: 2622 LAMP1_WT_Untreated: 3067 MitoTracker_WT_Untreated: 2728 NCL_WT_Untreated: 2709 NEMO_WT_Untreated: 2935 P54_WT_Untreated: 2623 PEX14_WT_Untreated: 2505 PML_WT_Untreated: 2297 PSD95_WT_Untreated: 2101 PURA_WT_Untreated: 2712 Phalloidin_WT_Untreated: 2219 SNCA_WT_Untreated: 2454 SQSTM1_WT_Untreated: 2651 TDP43_WT_Untreated: 2535 TOMM20_WT_Untreated: 2363 TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.76 0.91 0.83 2123
1 0.92 0.74 0.82 2536
2 0.92 0.86 0.89 2079
3 0.99 1.00 1.00 24823
4 0.73 0.60 0.66 2319
5 0.61 0.76 0.68 2608
6 0.88 0.93 0.90 2236
7 0.87 0.91 0.89 2265
8 0.82 0.86 0.84 2110
9 0.74 0.61 0.66 2104
10 0.77 0.95 0.85 2243
11 0.97 0.66 0.78 2236
12 0.99 0.91 0.95 2227
13 0.75 0.89 0.81 2360
14 0.90 0.81 0.85 1916
15 0.92 0.94 0.93 2074
16 0.84 0.38 0.53 1818
17 0.80 0.52 0.63 1631
18 0.85 0.96 0.90 2090
19 0.90 0.39 0.54 2019
20 0.76 0.74 0.75 1923
21 0.53 0.49 0.51 1654
22 0.77 0.87 0.81 1934
23 0.95 0.80 0.87 2114
24 0.88 1.00 0.94 18531
accuracy 0.88 91973
macro avg 0.83 0.78 0.79 91973
weighted avg 0.88 0.88 0.87 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.62 0.89 0.73 1850
1 0.66 0.12 0.20 2044
2 0.91 0.91 0.91 2332
3 0.98 1.00 0.99 22599
4 0.54 0.38 0.45 1901
5 0.34 0.53 0.41 1492
6 0.82 0.81 0.82 2095
7 0.84 0.75 0.79 2384
8 0.85 0.84 0.84 2145
9 0.83 0.43 0.57 2358
10 0.67 0.94 0.78 2340
11 0.97 0.49 0.65 2095
12 0.97 0.96 0.96 2085
13 0.63 0.99 0.77 2117
14 0.91 0.77 0.83 1751
15 0.85 0.61 0.71 1855
16 0.74 0.16 0.26 1623
17 0.51 0.26 0.35 1903
18 0.74 0.97 0.84 2085
19 0.98 0.41 0.58 2152
20 0.61 0.75 0.67 1857
21 0.43 0.36 0.39 1484
22 0.77 0.88 0.82 1836
23 0.91 0.86 0.88 2200
24 0.81 0.99 0.89 16469
accuracy 0.82 85052
macro avg 0.75 0.68 0.68 85052
weighted avg 0.83 0.82 0.80 85052
=== Overall Accuracy ===
0.8487214343931828 [0.8779206941167517, 0.8195221746696139]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.988736 0.901586 0.990737 0.690839 0.997725
CLTC_WT_Untreated 0.984522 0.462445 0.998388 0.883973 0.985902
Calreticulin_WT_Untreated 0.995080 0.883246 0.997938 0.916275 0.997019
DAPI_WT_Untreated 0.996249 0.999937 0.994900 0.986252 0.999977
DCP1A_WT_Untreated 0.981731 0.500474 0.993484 0.652254 0.987870
FMRP_WT_Untreated 0.976608 0.679512 0.983652 0.496348 0.992334
FUS_WT_Untreated 0.993182 0.869083 0.996294 0.854678 0.996715
G3BP1_WT_Untreated 0.991883 0.827920 0.996305 0.858003 0.995363
GM130_WT_Untreated 0.992317 0.848649 0.995856 0.834527 0.996271
KIF5A_WT_Untreated 0.983957 0.514343 0.996100 0.773248 0.987550
LAMP1_WT_Untreated 0.988894 0.947196 0.990002 0.715746 0.998584
MitoTracker_WT_Untreated 0.989211 0.578850 0.999502 0.966834 0.989543
NCL_WT_Untreated 0.997932 0.933673 0.999537 0.980516 0.998346
NEMO_WT_Untreated 0.987442 0.933884 0.988832 0.684512 0.998268
P54_WT_Untreated 0.993961 0.792746 0.998218 0.903918 0.995627
PEX14_WT_Untreated 0.993170 0.787987 0.997828 0.891705 0.995200
PML_WT_Untreated 0.984697 0.278117 0.998704 0.809645 0.985874
PSD95_WT_Untreated 0.983669 0.381154 0.995942 0.656753 0.987501
PURA_WT_Untreated 0.993250 0.967665 0.993868 0.792157 0.999215
Phalloidin_WT_Untreated 0.985234 0.398945 0.999381 0.939582 0.985695
SNCA_WT_Untreated 0.986872 0.745503 0.992138 0.674163 0.994434
SQSTM1_WT_Untreated 0.981726 0.428617 0.991707 0.482598 0.989709
TDP43_WT_Untreated 0.991645 0.875066 0.994182 0.765962 0.997273
TOMM20_WT_Untreated 0.994323 0.833102 0.998350 0.926527 0.995842
TUJ1_WT_Untreated 0.963435 0.996943 0.955177 0.845706 0.999212
Macro Average 0.987989 0.734666 0.993481 0.799309 0.993882
{'Accuracy': 0.9879890410958904,
'Sensitivity': 0.7346657636295809,
'Specificity': 0.9934807932408529,
'PPV': 0.7993088532262571,
'NPV': 0.9938820322409838}
run_baseline_model(Cytoself_dataset_config,
batches=[1,2,3],
classifier_class=ExtraTreesClassifier,
classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
"n_jobs": -1, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 16:53:07 INFO: [load_embeddings] multiplex=False 2025-08-20 16:53:07 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:53:07 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 16:53:07 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/
Loading all batches...
2025-08-20 16:53:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:53:27 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:53:31 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:53:33 INFO: [load_embeddings] embeddings shape: (112878, 2048) 2025-08-20 16:53:33 INFO: [load_embeddings] labels shape: (112878,) 2025-08-20 16:53:33 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:53:33 INFO: [load_embeddings] paths shape: (112878,) 2025-08-20 16:53:33 INFO: [load_embeddings] multiplex=False 2025-08-20 16:53:33 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:53:33 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 16:53:33 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:53:45 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:53:49 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:53:52 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:53:53 INFO: [load_embeddings] embeddings shape: (91973, 2048) 2025-08-20 16:53:53 INFO: [load_embeddings] labels shape: (91973,) 2025-08-20 16:53:53 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:53:53 INFO: [load_embeddings] paths shape: (91973,) 2025-08-20 16:53:54 INFO: [load_embeddings] multiplex=False 2025-08-20 16:53:54 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 16:53:54 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 16:53:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/cytoself_model/ 2025-08-20 16:54:54 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 16:54:58 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 16:55:01 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 16:55:02 INFO: [load_embeddings] embeddings shape: (85052, 2048) 2025-08-20 16:55:02 INFO: [load_embeddings] labels shape: (85052,) 2025-08-20 16:55:02 INFO: [load_embeddings] example label: ANAX11_WT_Untreated 2025-08-20 16:55:02 INFO: [load_embeddings] paths shape: (85052,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (91973, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
precision recall f1-score support
0 0.76 0.93 0.83 2123
1 0.95 0.76 0.85 2536
2 0.93 0.85 0.89 2079
3 0.99 1.00 1.00 24823
4 0.77 0.60 0.67 2319
5 0.63 0.74 0.68 2608
6 0.88 0.93 0.90 2236
7 0.89 0.90 0.90 2265
8 0.85 0.87 0.86 2110
9 0.70 0.63 0.66 2104
10 0.79 0.96 0.87 2243
11 0.98 0.44 0.61 2236
12 0.99 0.94 0.97 2227
13 0.77 0.90 0.83 2360
14 0.91 0.86 0.88 1916
15 0.92 0.96 0.94 2074
16 0.84 0.48 0.61 1818
17 0.82 0.53 0.64 1631
18 0.87 0.98 0.92 2090
19 0.92 0.39 0.54 2019
20 0.76 0.76 0.76 1923
21 0.50 0.48 0.49 1654
22 0.81 0.86 0.84 1934
23 0.96 0.78 0.86 2114
24 0.85 1.00 0.92 18531
accuracy 0.88 91973
macro avg 0.84 0.78 0.80 91973
weighted avg 0.88 0.88 0.87 91973
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (112878, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
Test: (85052, 2048) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24]
ANAX11_WT_Untreated: 2614
CLTC_WT_Untreated: 2439
Calreticulin_WT_Untreated: 3056
DAPI_WT_Untreated: 30429
DCP1A_WT_Untreated: 2364
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
G3BP1_WT_Untreated: 2842
GM130_WT_Untreated: 2371
KIF5A_WT_Untreated: 2622
LAMP1_WT_Untreated: 3067
MitoTracker_WT_Untreated: 2728
NCL_WT_Untreated: 2709
NEMO_WT_Untreated: 2935
P54_WT_Untreated: 2623
PEX14_WT_Untreated: 2505
PML_WT_Untreated: 2297
PSD95_WT_Untreated: 2101
PURA_WT_Untreated: 2712
Phalloidin_WT_Untreated: 2219
SNCA_WT_Untreated: 2454
SQSTM1_WT_Untreated: 2651
TDP43_WT_Untreated: 2535
TOMM20_WT_Untreated: 2363
TUJ1_WT_Untreated: 22601
precision recall f1-score support
0 0.65 0.91 0.76 1850
1 0.71 0.10 0.18 2044
2 0.91 0.91 0.91 2332
3 0.99 1.00 0.99 22599
4 0.57 0.39 0.46 1901
5 0.34 0.53 0.41 1492
6 0.82 0.85 0.84 2095
7 0.87 0.74 0.80 2384
8 0.86 0.84 0.85 2145
9 0.77 0.47 0.58 2358
10 0.69 0.95 0.80 2340
11 0.99 0.26 0.41 2095
12 0.97 0.97 0.97 2085
13 0.63 0.99 0.77 2117
14 0.91 0.83 0.87 1751
15 0.84 0.66 0.74 1855
16 0.75 0.21 0.33 1623
17 0.50 0.25 0.33 1903
18 0.77 0.98 0.86 2085
19 0.98 0.42 0.59 2152
20 0.65 0.79 0.71 1857
21 0.44 0.35 0.39 1484
22 0.82 0.88 0.85 1836
23 0.94 0.83 0.88 2200
24 0.78 1.00 0.87 16469
accuracy 0.82 85052
macro avg 0.77 0.68 0.69 85052
weighted avg 0.83 0.82 0.80 85052
=== Overall Accuracy ===
0.8488203606606268 [0.8778010937992672, 0.8198396275219866]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.989470 0.919960 0.991066 0.702749 0.998149
CLTC_WT_Untreated 0.985177 0.467031 0.998939 0.921189 0.986028
Calreticulin_WT_Untreated 0.995227 0.885287 0.998036 0.920123 0.997071
DAPI_WT_Untreated 0.997328 0.999958 0.996366 0.990165 0.999985
DCP1A_WT_Untreated 0.982765 0.502844 0.994485 0.690081 0.987939
FMRP_WT_Untreated 0.976856 0.661707 0.984328 0.500277 0.991917
FUS_WT_Untreated 0.993492 0.891249 0.996057 0.850033 0.997269
G3BP1_WT_Untreated 0.992459 0.821682 0.997065 0.883033 0.995200
GM130_WT_Untreated 0.993046 0.855229 0.996440 0.855430 0.996435
KIF5A_WT_Untreated 0.983516 0.543478 0.994895 0.733515 0.988274
LAMP1_WT_Untreated 0.989866 0.956360 0.990756 0.733311 0.998831
MitoTracker_WT_Untreated 0.984036 0.352805 0.999867 0.985171 0.984026
NCL_WT_Untreated 0.998503 0.957096 0.999537 0.980984 0.998930
NEMO_WT_Untreated 0.987985 0.942819 0.989157 0.692876 0.998502
P54_WT_Untreated 0.995001 0.845378 0.998166 0.906963 0.996734
PEX14_WT_Untreated 0.993639 0.815475 0.997683 0.888766 0.995819
PML_WT_Untreated 0.985838 0.351351 0.998416 0.814690 0.987285
PSD95_WT_Untreated 0.983788 0.378891 0.996109 0.664846 0.987458
PURA_WT_Untreated 0.994153 0.977725 0.994550 0.812500 0.999459
Phalloidin_WT_Untreated 0.985465 0.403261 0.999514 0.952435 0.985798
SNCA_WT_Untreated 0.988103 0.771429 0.992831 0.701299 0.995002
SQSTM1_WT_Untreated 0.981511 0.417782 0.991684 0.475517 0.989516
TDP43_WT_Untreated 0.992978 0.869496 0.995665 0.813601 0.997156
TOMM20_WT_Untreated 0.994182 0.804590 0.998917 0.948879 0.995137
TUJ1_WT_Untreated 0.955520 0.997543 0.945165 0.817620 0.999360
Macro Average 0.987996 0.735617 0.993428 0.809442 0.993891
{'Accuracy': 0.9879962717130348,
'Sensitivity': 0.7356169771338387,
'Specificity': 0.9934277589600906,
'PPV': 0.8094420820850715,
'NPV': 0.9938912399879954}
pretrained_dataset_config = {
"path_to_embeddings": "/home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model",
"multiplexed": False,
"config_fmt": "NIH_UMAP1_DatasetConfig_B{batch}",
"config_dir": "manuscript/manuscript_figures_data_config",
}
## Baseline
run_baseline_model(
dataset_config= pretrained_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=cuMLLogisticRegression,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:37:26 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:26 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:26 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:37:26 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:37:30 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:31 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:31 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:32 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:37:32 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:37:32 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:37:32 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:37:32 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:32 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:32 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:37:32 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:37:35 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:35 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:36 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:36 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:37:36 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:37:36 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:37:36 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:37:36 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:36 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:36 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:37:36 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:37:39 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:39 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:40 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:40 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:37:40 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:37:40 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:37:40 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.91 0.94 0.92 2123
1 0.99 0.95 0.97 2536
2 0.97 0.94 0.96 2079
3 1.00 1.00 1.00 24823
4 0.89 0.82 0.85 2319
5 0.85 0.88 0.87 2608
6 0.98 0.92 0.95 2236
7 0.96 0.98 0.97 2265
8 0.97 0.96 0.96 2110
9 0.89 0.87 0.88 2104
10 0.95 0.96 0.96 2243
11 0.97 0.97 0.97 2236
12 0.99 0.99 0.99 2227
13 0.95 0.93 0.94 2360
14 0.94 0.92 0.93 1916
15 0.96 0.97 0.96 2074
16 0.92 0.85 0.89 1818
17 0.88 0.88 0.88 1631
18 0.95 0.97 0.96 2090
19 0.98 0.91 0.95 2019
20 0.90 0.94 0.92 1923
21 0.76 0.85 0.80 1654
22 0.87 0.92 0.89 1934
23 0.91 0.94 0.93 2086
24 0.97 0.97 0.97 2114
25 0.99 1.00 0.99 18531
accuracy 0.96 94059
macro avg 0.93 0.93 0.93 94059
weighted avg 0.96 0.96 0.96 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.83 0.88 0.85 1850
1 0.96 0.51 0.67 2044
2 0.97 0.96 0.97 2332
3 0.99 1.00 0.99 22599
4 0.76 0.62 0.68 1901
5 0.48 0.41 0.44 1492
6 0.97 0.80 0.88 2095
7 0.81 0.96 0.88 2384
8 0.94 0.97 0.95 2145
9 0.96 0.84 0.89 2358
10 0.95 0.97 0.96 2340
11 0.94 0.94 0.94 2095
12 0.98 0.97 0.98 2085
13 0.82 0.99 0.89 2117
14 0.94 0.89 0.92 1751
15 0.97 0.91 0.94 1855
16 0.67 0.50 0.58 1623
17 0.66 0.71 0.69 1903
18 0.87 0.98 0.92 2085
19 0.98 0.91 0.95 2152
20 0.65 0.92 0.76 1857
21 0.80 0.54 0.64 1484
22 0.89 0.94 0.92 1836
23 0.84 0.94 0.89 2078
24 0.95 0.98 0.96 2200
25 0.95 0.99 0.97 16469
accuracy 0.91 87130
macro avg 0.87 0.85 0.85 87130
weighted avg 0.92 0.91 0.91 87130
=== Overall Accuracy ===
0.9370611776259995 [0.9599825641352768, 0.9141397911167222]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.995060 0.908130 0.997009 0.871919 0.997938
CLTC_WT_Untreated 0.993322 0.752402 0.999570 0.978421 0.993617
Calreticulin_WT_Untreated 0.998184 0.952618 0.999321 0.972235 0.998818
DAPI_WT_Untreated 0.998471 0.999895 0.997967 0.994296 0.999963
DCP1A_WT_Untreated 0.990336 0.731043 0.996519 0.833558 0.993605
FMRP_WT_Untreated 0.987499 0.707561 0.993980 0.731283 0.993234
FUS_WT_Untreated 0.996137 0.859617 0.999480 0.975885 0.996572
G3BP1_WT_Untreated 0.995690 0.968595 0.996403 0.876411 0.999171
GM130_WT_Untreated 0.998046 0.963572 0.998875 0.953710 0.999124
KIF5A_WT_Untreated 0.994740 0.852757 0.998325 0.927822 0.996290
LAMP1_WT_Untreated 0.997792 0.963125 0.998692 0.950269 0.999043
MitoTracker_WT_Untreated 0.997892 0.958439 0.998858 0.953595 0.998982
NCL_WT_Untreated 0.999200 0.980056 0.999666 0.986231 0.999514
NEMO_WT_Untreated 0.995673 0.957338 0.996644 0.878459 0.998917
P54_WT_Untreated 0.996926 0.906463 0.998795 0.939514 0.998069
PEX14_WT_Untreated 0.997936 0.939425 0.999233 0.964463 0.998658
PML_WT_Untreated 0.991125 0.687591 0.997001 0.816143 0.993970
PSD95_WT_Untreated 0.991026 0.788908 0.995047 0.760087 0.995798
PURA_WT_Untreated 0.997119 0.973892 0.997667 0.907792 0.999383
Phalloidin_WT_Untreated 0.997660 0.914889 0.999610 0.982239 0.997998
SNCA_WT_Untreated 0.992362 0.930952 0.993670 0.758078 0.998522
SQSTM1_WT_Untreated 0.991285 0.700446 0.996411 0.774762 0.994730
TDP43_WT_Untreated 0.995921 0.930239 0.997317 0.880492 0.998516
TIA1_WT_Untreated 0.995496 0.943324 0.996724 0.871340 0.998664
TOMM20_WT_Untreated 0.998383 0.975661 0.998937 0.957244 0.999406
TUJ1_WT_Untreated 0.992593 0.990429 0.993112 0.971771 0.997698
Macro Average 0.995226 0.893745 0.997494 0.902616 0.997546
{'Accuracy': 0.9952259795020667,
'Sensitivity': 0.8937447502194574,
'Specificity': 0.9974935796886114,
'PPV': 0.9026161965801902,
'NPV': 0.9975461538416751}
## Baseline
run_baseline_model(
dataset_config= pretrained_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=GaussianNB,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:37:47 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:47 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:47 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:37:47 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:37:49 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:50 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:50 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:51 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:37:51 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:37:51 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:37:51 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:37:51 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:51 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:51 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:37:51 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:37:52 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:53 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:53 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:53 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:37:53 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:37:53 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:37:53 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:37:53 INFO: [load_embeddings] multiplex=False 2025-08-20 17:37:53 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:37:53 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:37:53 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:37:55 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:37:55 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:37:56 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:37:56 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:37:56 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:37:56 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:37:56 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.82 0.85 0.83 2123
1 0.93 0.90 0.91 2536
2 0.86 0.87 0.86 2079
3 1.00 1.00 1.00 24823
4 0.77 0.67 0.72 2319
5 0.73 0.72 0.72 2608
6 0.96 0.86 0.90 2236
7 0.81 0.96 0.88 2265
8 0.91 0.88 0.89 2110
9 0.79 0.85 0.82 2104
10 0.83 0.88 0.85 2243
11 0.86 0.95 0.91 2236
12 0.94 0.98 0.96 2227
13 0.92 0.86 0.89 2360
14 0.87 0.79 0.83 1916
15 0.90 0.91 0.90 2074
16 0.71 0.71 0.71 1818
17 0.77 0.81 0.79 1631
18 0.90 0.92 0.91 2090
19 0.92 0.90 0.91 2019
20 0.75 0.90 0.82 1923
21 0.70 0.66 0.68 1654
22 0.76 0.81 0.79 1934
23 0.80 0.89 0.84 2086
24 0.89 0.88 0.89 2114
25 0.99 0.96 0.97 18531
accuracy 0.91 94059
macro avg 0.85 0.86 0.85 94059
weighted avg 0.91 0.91 0.91 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.76 0.79 0.77 1850
1 0.81 0.46 0.59 2044
2 0.89 0.89 0.89 2332
3 0.99 1.00 0.99 22599
4 0.64 0.38 0.47 1901
5 0.19 0.12 0.15 1492
6 0.91 0.70 0.79 2095
7 0.67 0.96 0.79 2384
8 0.90 0.87 0.88 2145
9 0.92 0.81 0.87 2358
10 0.78 0.90 0.84 2340
11 0.76 0.92 0.83 2095
12 0.93 0.97 0.95 2085
13 0.83 0.95 0.89 2117
14 0.88 0.74 0.80 1751
15 0.87 0.72 0.79 1855
16 0.48 0.43 0.45 1623
17 0.61 0.62 0.61 1903
18 0.82 0.95 0.88 2085
19 0.94 0.89 0.92 2152
20 0.48 0.94 0.63 1857
21 0.85 0.47 0.61 1484
22 0.78 0.85 0.82 1836
23 0.75 0.91 0.82 2078
24 0.88 0.89 0.89 2200
25 0.97 0.93 0.95 16469
accuracy 0.86 87130
macro avg 0.78 0.77 0.76 87130
weighted avg 0.87 0.86 0.86 87130
=== Overall Accuracy ===
0.8862662487311861 [0.9105561402949213, 0.8619763571674509]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.991203 0.818273 0.995079 0.788504 0.995922
CLTC_WT_Untreated 0.990226 0.700873 0.997729 0.888950 0.992285
Calreticulin_WT_Untreated 0.993918 0.880299 0.996753 0.871214 0.997012
DAPI_WT_Untreated 0.997196 0.995087 0.997944 0.994206 0.998258
DCP1A_WT_Untreated 0.984381 0.540047 0.994977 0.719381 0.989097
FMRP_WT_Untreated 0.980744 0.499512 0.991885 0.587661 0.988453
FUS_WT_Untreated 0.993388 0.779266 0.998632 0.933094 0.994616
G3BP1_WT_Untreated 0.989972 0.961282 0.990727 0.731903 0.998972
GM130_WT_Untreated 0.994818 0.872385 0.997762 0.903603 0.996934
KIF5A_WT_Untreated 0.992290 0.829673 0.996396 0.853192 0.995703
LAMP1_WT_Untreated 0.991710 0.890901 0.994326 0.802950 0.997161
MitoTracker_WT_Untreated 0.993289 0.938582 0.994628 0.810568 0.998490
NCL_WT_Untreated 0.997825 0.976809 0.998338 0.934754 0.999434
NEMO_WT_Untreated 0.994497 0.904847 0.996769 0.876460 0.997587
P54_WT_Untreated 0.993068 0.764930 0.997781 0.876837 0.995157
PEX14_WT_Untreated 0.993846 0.817256 0.997760 0.889967 0.995957
PML_WT_Untreated 0.984828 0.575414 0.992754 0.605875 0.991788
PSD95_WT_Untreated 0.987935 0.707980 0.993504 0.684354 0.994187
PURA_WT_Untreated 0.995060 0.937725 0.996413 0.860440 0.998528
Phalloidin_WT_Untreated 0.996065 0.895708 0.998430 0.930742 0.997545
SNCA_WT_Untreated 0.984651 0.920106 0.986027 0.583851 0.998277
SQSTM1_WT_Untreated 0.989337 0.570108 0.996726 0.754216 0.992456
TDP43_WT_Untreated 0.991374 0.828912 0.994826 0.772941 0.996359
TIA1_WT_Untreated 0.991572 0.896494 0.993809 0.773038 0.997556
TOMM20_WT_Untreated 0.994641 0.888503 0.997230 0.886653 0.997280
TUJ1_WT_Untreated 0.986555 0.946114 0.996238 0.983662 0.987216
Macro Average 0.991323 0.820657 0.995517 0.819193 0.995470
{'Accuracy': 0.991322702982903,
'Sensitivity': 0.8206572429223186,
'Specificity': 0.9955169763968997,
'PPV': 0.81919290735299,
'NPV': 0.9954703683087419}
## Baseline
run_baseline_model(
dataset_config= pretrained_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=RidgeClassifier,
classifier_kwargs={},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:38:02 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:02 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:02 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:38:02 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:38:04 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:05 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:05 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:06 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:38:06 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:38:06 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:38:06 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:38:06 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:06 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:06 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:38:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:38:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:07 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:08 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:08 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:38:08 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:38:08 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:38:08 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:38:08 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:08 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:08 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:38:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:38:09 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:10 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:10 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:11 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:38:11 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:38:11 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:38:11 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.83 0.86 0.85 2123
1 0.98 0.82 0.89 2536
2 0.88 0.88 0.88 2079
3 0.99 1.00 0.99 24823
4 0.89 0.72 0.80 2319
5 0.79 0.84 0.81 2608
6 0.95 0.90 0.92 2236
7 0.93 0.95 0.94 2265
8 0.93 0.94 0.94 2110
9 0.86 0.83 0.85 2104
10 0.89 0.94 0.92 2243
11 0.96 0.95 0.96 2236
12 0.99 0.97 0.98 2227
13 0.89 0.93 0.91 2360
14 0.84 0.90 0.87 1916
15 0.93 0.94 0.94 2074
16 0.92 0.66 0.77 1818
17 0.87 0.83 0.84 1631
18 0.90 0.95 0.93 2090
19 0.97 0.80 0.88 2019
20 0.84 0.91 0.87 1923
21 0.78 0.74 0.76 1654
22 0.83 0.80 0.81 1934
23 0.87 0.89 0.88 2086
24 0.95 0.93 0.94 2114
25 0.95 1.00 0.97 18531
accuracy 0.93 94059
macro avg 0.90 0.88 0.89 94059
weighted avg 0.93 0.93 0.93 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.80 0.77 0.78 1850
1 0.93 0.30 0.45 2044
2 0.95 0.92 0.93 2332
3 0.97 1.00 0.99 22599
4 0.77 0.39 0.52 1901
5 0.31 0.27 0.29 1492
6 0.93 0.74 0.82 2095
7 0.81 0.91 0.86 2384
8 0.92 0.94 0.93 2145
9 0.93 0.78 0.85 2358
10 0.90 0.95 0.92 2340
11 0.92 0.92 0.92 2095
12 0.98 0.96 0.97 2085
13 0.69 0.99 0.81 2117
14 0.84 0.85 0.85 1751
15 0.94 0.89 0.91 1855
16 0.66 0.40 0.50 1623
17 0.63 0.69 0.66 1903
18 0.83 0.97 0.89 2085
19 0.97 0.83 0.90 2152
20 0.55 0.82 0.66 1857
21 0.79 0.38 0.51 1484
22 0.85 0.87 0.86 1836
23 0.80 0.88 0.84 2078
24 0.90 0.95 0.92 2200
25 0.91 0.98 0.94 16469
accuracy 0.88 87130
macro avg 0.83 0.78 0.79 87130
weighted avg 0.88 0.88 0.87 87130
=== Overall Accuracy ===
0.9040429000423724 [0.9303628573554896, 0.8777229427292551]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.991992 0.820539 0.995836 0.815408 0.995976
CLTC_WT_Untreated 0.988984 0.584934 0.999462 0.965753 0.989345
Calreticulin_WT_Untreated 0.995541 0.899796 0.997930 0.915571 0.997501
DAPI_WT_Untreated 0.994906 1.000000 0.993100 0.980908 1.000000
DCP1A_WT_Untreated 0.987665 0.573460 0.997542 0.847636 0.989907
FMRP_WT_Untreated 0.983388 0.630976 0.991547 0.633448 0.991457
FUS_WT_Untreated 0.994409 0.818518 0.998716 0.939820 0.995570
G3BP1_WT_Untreated 0.994508 0.928157 0.996256 0.867162 0.998105
GM130_WT_Untreated 0.996827 0.939365 0.998208 0.926518 0.998541
KIF5A_WT_Untreated 0.992864 0.803227 0.997652 0.896224 0.995045
LAMP1_WT_Untreated 0.995883 0.948287 0.997118 0.895160 0.998656
MitoTracker_WT_Untreated 0.997053 0.939737 0.998456 0.937140 0.998524
NCL_WT_Untreated 0.998874 0.964750 0.999706 0.987654 0.999141
NEMO_WT_Untreated 0.992257 0.960241 0.993068 0.778240 0.998987
P54_WT_Untreated 0.994111 0.875648 0.996558 0.840136 0.997429
PEX14_WT_Untreated 0.996843 0.920845 0.998528 0.932715 0.998246
PML_WT_Untreated 0.988835 0.538797 0.997547 0.809607 0.991129
PSD95_WT_Untreated 0.989828 0.752405 0.994551 0.733113 0.995072
PURA_WT_Untreated 0.995596 0.961198 0.996407 0.863196 0.999082
Phalloidin_WT_Untreated 0.995182 0.813474 0.999463 0.972764 0.995622
SNCA_WT_Untreated 0.988371 0.866667 0.990964 0.671449 0.997141
SQSTM1_WT_Untreated 0.989751 0.568834 0.997169 0.779817 0.992437
TDP43_WT_Untreated 0.993140 0.832361 0.996556 0.837023 0.996438
TIA1_WT_Untreated 0.993294 0.884726 0.995848 0.833673 0.997285
TOMM20_WT_Untreated 0.996711 0.941122 0.998066 0.922308 0.998563
TUJ1_WT_Untreated 0.983288 0.989743 0.981743 0.928464 0.997505
Macro Average 0.992696 0.836839 0.996077 0.865804 0.996258
{'Accuracy': 0.9926961095023175,
'Sensitivity': 0.8368386339245435,
'Specificity': 0.9960768450269657,
'PPV': 0.8658041113997191,
'NPV': 0.9962578451271408}
## Baseline
run_baseline_model(
dataset_config= pretrained_dataset_config,
batches=[1, 2, 3],
balance=False,
norm=False,
choose_features=False,
top_k=100,
label_map=None,
classifier_class=LinearSVC,
classifier_kwargs={"C": 1.0, "max_iter": 1000, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:38:14 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:14 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:14 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:38:14 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:38:16 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:17 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:17 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:17 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:38:17 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:38:17 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:38:17 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:38:17 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:17 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:17 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:38:17 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:38:19 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:19 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:20 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:20 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:38:20 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:38:20 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:38:20 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:38:20 INFO: [load_embeddings] multiplex=False 2025-08-20 17:38:20 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:38:20 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:38:20 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:38:21 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:38:22 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:38:22 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:38:22 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:38:22 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:38:22 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:38:22 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] CLTC_WT_Untreated: 2439 DAPI_WT_Untreated: 30429 GM130_WT_Untreated: 2371 PURA_WT_Untreated: 2712 PSD95_WT_Untreated: 2101 SQSTM1_WT_Untreated: 2651 SNCA_WT_Untreated: 2454 G3BP1_WT_Untreated: 2842 PEX14_WT_Untreated: 2505 KIF5A_WT_Untreated: 2622 NEMO_WT_Untreated: 2935 MitoTracker_WT_Untreated: 2728 TIA1_WT_Untreated: 2712 NCL_WT_Untreated: 2709 TUJ1_WT_Untreated: 22601 P54_WT_Untreated: 2623 ANAX11_WT_Untreated: 2614 DCP1A_WT_Untreated: 2364 TOMM20_WT_Untreated: 2363 FMRP_WT_Untreated: 2913 FUS_WT_Untreated: 2728 PML_WT_Untreated: 2297 Calreticulin_WT_Untreated: 3056 TDP43_WT_Untreated: 2535 Phalloidin_WT_Untreated: 2219 LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
precision recall f1-score support
0 0.93 0.95 0.94 2123
1 0.99 0.96 0.97 2536
2 0.97 0.96 0.97 2079
3 1.00 1.00 1.00 24823
4 0.91 0.85 0.88 2319
5 0.87 0.90 0.88 2608
6 0.98 0.94 0.96 2236
7 0.97 0.98 0.98 2265
8 0.97 0.98 0.98 2110
9 0.91 0.87 0.89 2104
10 0.97 0.97 0.97 2243
11 0.97 0.98 0.97 2236
12 0.99 1.00 1.00 2227
13 0.96 0.94 0.95 2360
14 0.94 0.94 0.94 1916
15 0.97 0.97 0.97 2074
16 0.93 0.89 0.91 1818
17 0.89 0.88 0.89 1631
18 0.96 0.98 0.97 2090
19 0.99 0.94 0.96 2019
20 0.91 0.95 0.93 1923
21 0.77 0.85 0.81 1654
22 0.89 0.92 0.90 1934
23 0.92 0.95 0.93 2086
24 0.98 0.98 0.98 2114
25 0.99 1.00 0.99 18531
accuracy 0.97 94059
macro avg 0.94 0.94 0.94 94059
weighted avg 0.97 0.97 0.97 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/sklearn/svm/_classes.py:32: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
precision recall f1-score support
0 0.88 0.89 0.89 1850
1 0.96 0.55 0.70 2044
2 0.97 0.98 0.97 2332
3 1.00 1.00 1.00 22599
4 0.78 0.66 0.72 1901
5 0.48 0.38 0.42 1492
6 0.98 0.89 0.93 2095
7 0.82 0.97 0.89 2384
8 0.93 0.97 0.95 2145
9 0.97 0.82 0.89 2358
10 0.97 0.97 0.97 2340
11 0.95 0.96 0.96 2095
12 0.97 0.99 0.98 2085
13 0.84 0.99 0.91 2117
14 0.94 0.91 0.93 1751
15 0.97 0.92 0.94 1855
16 0.67 0.55 0.60 1623
17 0.64 0.72 0.68 1903
18 0.88 0.99 0.93 2085
19 0.98 0.94 0.96 2152
20 0.67 0.94 0.78 1857
21 0.74 0.48 0.58 1484
22 0.91 0.94 0.93 1836
23 0.85 0.95 0.90 2078
24 0.96 0.99 0.98 2200
25 0.96 0.99 0.98 16469
accuracy 0.92 87130
macro avg 0.87 0.86 0.86 87130
weighted avg 0.92 0.92 0.92 87130
=== Overall Accuracy ===
0.9445453416512052 [0.9668824886507404, 0.9222081946516699]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.996313 0.926001 0.997890 0.907723 0.998340
CLTC_WT_Untreated 0.993940 0.776856 0.999570 0.979086 0.994244
Calreticulin_WT_Untreated 0.998587 0.968488 0.999338 0.973342 0.999214
DAPI_WT_Untreated 0.999404 0.999937 0.999215 0.997791 0.999978
DCP1A_WT_Untreated 0.991407 0.763507 0.996841 0.852156 0.994375
FMRP_WT_Untreated 0.988040 0.708293 0.994517 0.749419 0.993255
FUS_WT_Untreated 0.997494 0.914108 0.999536 0.979708 0.997900
G3BP1_WT_Untreated 0.996275 0.978920 0.996732 0.887480 0.999443
GM130_WT_Untreated 0.998306 0.976028 0.998841 0.952960 0.999423
KIF5A_WT_Untreated 0.994851 0.843568 0.998670 0.941235 0.996061
LAMP1_WT_Untreated 0.998444 0.971634 0.999139 0.966992 0.999264
MitoTracker_WT_Untreated 0.998377 0.972062 0.999022 0.960529 0.999316
NCL_WT_Untreated 0.999393 0.991187 0.999593 0.983433 0.999785
NEMO_WT_Untreated 0.996407 0.961358 0.997295 0.900042 0.999019
P54_WT_Untreated 0.997351 0.923643 0.998873 0.944243 0.998423
PEX14_WT_Untreated 0.998273 0.946806 0.999413 0.972803 0.998822
PML_WT_Untreated 0.991749 0.727114 0.996872 0.818182 0.994729
PSD95_WT_Untreated 0.990877 0.796265 0.994748 0.751001 0.995942
PURA_WT_Untreated 0.997583 0.982275 0.997944 0.918477 0.999581
Phalloidin_WT_Untreated 0.998256 0.939343 0.999644 0.984175 0.998572
SNCA_WT_Untreated 0.993079 0.945238 0.994098 0.773377 0.998828
SQSTM1_WT_Untreated 0.990756 0.678776 0.996254 0.761530 0.994350
TDP43_WT_Untreated 0.996396 0.929178 0.997824 0.900746 0.998494
TIA1_WT_Untreated 0.995982 0.950048 0.997063 0.883825 0.998823
TOMM20_WT_Untreated 0.999018 0.986555 0.999322 0.972578 0.999672
TUJ1_WT_Untreated 0.994244 0.991829 0.994822 0.978659 0.998037
Macro Average 0.995800 0.905731 0.997811 0.911211 0.997842
{'Accuracy': 0.9957999657815871,
'Sensitivity': 0.9057314282550202,
'Specificity': 0.9978106343982152,
'PPV': 0.9112112336388931,
'NPV': 0.9978418992101846}
run_baseline_model(
pretrained_dataset_config,
batches=[1,2,3,],
classifier_class=cuRF,
classifier_kwargs={"n_estimators": 300, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:39:02 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:02 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:02 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:39:02 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:39:04 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:05 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:06 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:06 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:39:06 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:39:06 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:39:06 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:39:06 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:06 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:06 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:39:06 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:39:07 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:08 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:08 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:08 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:39:08 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:39:08 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:39:08 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:39:08 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:08 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:08 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:39:08 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:39:10 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:10 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:11 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:11 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:39:11 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:39:11 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:39:11 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded. Training on Batches: [1], Testing on: [2]. === Batch [2] === Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25] CLTC_WT_Untreated: 2439 DAPI_WT_Untreated: 30429 GM130_WT_Untreated: 2371 PURA_WT_Untreated: 2712 PSD95_WT_Untreated: 2101 SQSTM1_WT_Untreated: 2651 SNCA_WT_Untreated: 2454 G3BP1_WT_Untreated: 2842 PEX14_WT_Untreated: 2505 KIF5A_WT_Untreated: 2622 NEMO_WT_Untreated: 2935 MitoTracker_WT_Untreated: 2728 TIA1_WT_Untreated: 2712 NCL_WT_Untreated: 2709 TUJ1_WT_Untreated: 22601 P54_WT_Untreated: 2623 ANAX11_WT_Untreated: 2614 DCP1A_WT_Untreated: 2364 TOMM20_WT_Untreated: 2363 FMRP_WT_Untreated: 2913 FUS_WT_Untreated: 2728 PML_WT_Untreated: 2297 Calreticulin_WT_Untreated: 3056 TDP43_WT_Untreated: 2535 Phalloidin_WT_Untreated: 2219 LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.89 0.92 0.91 2123
1 0.98 0.89 0.94 2536
2 0.94 0.92 0.93 2079
3 1.00 1.00 1.00 24823
4 0.86 0.74 0.80 2319
5 0.79 0.85 0.81 2608
6 0.98 0.90 0.94 2236
7 0.95 0.97 0.96 2265
8 0.96 0.95 0.96 2110
9 0.87 0.87 0.87 2104
10 0.89 0.95 0.92 2243
11 0.96 0.96 0.96 2236
12 0.99 0.98 0.99 2227
13 0.95 0.92 0.93 2360
14 0.92 0.88 0.90 1916
15 0.95 0.94 0.95 2074
16 0.89 0.80 0.85 1818
17 0.88 0.84 0.86 1631
18 0.93 0.96 0.94 2090
19 0.98 0.87 0.92 2019
20 0.86 0.93 0.89 1923
21 0.76 0.78 0.77 1654
22 0.83 0.90 0.86 1934
23 0.88 0.93 0.91 2086
24 0.97 0.94 0.95 2114
25 0.97 0.99 0.98 18531
accuracy 0.95 94059
macro avg 0.92 0.91 0.91 94059
weighted avg 0.95 0.95 0.95 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
/home/projects/hornsteinlab/galavir/.conda/envs/nova/lib/python3.9/site-packages/cuml/internals/api_decorators.py:344: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams=1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set return func(**kwargs)
precision recall f1-score support
0 0.85 0.85 0.85 1850
1 0.93 0.38 0.54 2044
2 0.95 0.95 0.95 2332
3 0.98 1.00 0.99 22599
4 0.71 0.50 0.59 1901
5 0.32 0.26 0.29 1492
6 0.94 0.76 0.84 2095
7 0.82 0.92 0.87 2384
8 0.94 0.94 0.94 2145
9 0.96 0.83 0.89 2358
10 0.88 0.96 0.92 2340
11 0.94 0.92 0.93 2095
12 0.98 0.98 0.98 2085
13 0.80 0.98 0.88 2117
14 0.93 0.87 0.90 1751
15 0.96 0.84 0.90 1855
16 0.63 0.48 0.55 1623
17 0.70 0.66 0.68 1903
18 0.85 0.97 0.91 2085
19 0.97 0.89 0.93 2152
20 0.57 0.92 0.70 1857
21 0.89 0.61 0.72 1484
22 0.87 0.93 0.90 1836
23 0.83 0.95 0.88 2078
24 0.95 0.95 0.95 2200
25 0.93 0.99 0.96 16469
accuracy 0.90 87130
macro avg 0.85 0.82 0.82 87130
weighted avg 0.90 0.90 0.89 87130
=== Overall Accuracy ===
0.9220545504680028 [0.9459913458573874, 0.8981177550786181]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.994757 0.889756 0.997111 0.873487 0.997527
CLTC_WT_Untreated 0.990943 0.663100 0.999445 0.968740 0.991334
Calreticulin_WT_Untreated 0.997080 0.933575 0.998665 0.945797 0.998343
DAPI_WT_Untreated 0.997356 0.999937 0.996442 0.990062 0.999977
DCP1A_WT_Untreated 0.987759 0.633886 0.996197 0.798984 0.991312
FMRP_WT_Untreated 0.983895 0.632195 0.992038 0.647676 0.991489
FUS_WT_Untreated 0.995248 0.833988 0.999197 0.962174 0.995948
G3BP1_WT_Untreated 0.995314 0.944719 0.996647 0.881220 0.998541
GM130_WT_Untreated 0.997583 0.946886 0.998802 0.950012 0.998723
KIF5A_WT_Untreated 0.994459 0.852981 0.998031 0.916225 0.996295
LAMP1_WT_Untreated 0.995723 0.955706 0.996761 0.884491 0.998848
MitoTracker_WT_Untreated 0.997378 0.939506 0.998796 0.950257 0.998519
NCL_WT_Untreated 0.999244 0.979128 0.999734 0.988990 0.999491
NEMO_WT_Untreated 0.995226 0.949073 0.996395 0.869628 0.998707
P54_WT_Untreated 0.995966 0.872375 0.998518 0.924032 0.997367
PEX14_WT_Untreated 0.996788 0.896411 0.999013 0.952664 0.997707
PML_WT_Untreated 0.989878 0.653298 0.996394 0.778124 0.993309
PSD95_WT_Untreated 0.990987 0.739672 0.995987 0.785693 0.994827
PURA_WT_Untreated 0.996473 0.965030 0.997215 0.890977 0.999174
Phalloidin_WT_Untreated 0.996689 0.878207 0.999480 0.975499 0.997137
SNCA_WT_Untreated 0.989519 0.924074 0.990914 0.684231 0.998370
SQSTM1_WT_Untreated 0.991898 0.701083 0.997023 0.805861 0.994744
TDP43_WT_Untreated 0.994845 0.913528 0.996573 0.849951 0.998160
TIA1_WT_Untreated 0.994933 0.937800 0.996277 0.855609 0.998534
TOMM20_WT_Untreated 0.997831 0.946222 0.999090 0.962055 0.998689
TUJ1_WT_Untreated 0.988167 0.991371 0.987400 0.949589 0.997912
Macro Average 0.994075 0.868212 0.996852 0.886232 0.996961
{'Accuracy': 0.9940746105745085,
'Sensitivity': 0.8682118440730283,
'Specificity': 0.9968516972810667,
'PPV': 0.8862318807161186,
'NPV': 0.9969609440727576}
run_baseline_model(pretrained_dataset_config,
batches=[1,2,3],
classifier_class=ExtraTreesClassifier,
classifier_kwargs={"n_estimators": 300, "max_depth": None, "min_samples_leaf": 1,
"n_jobs": -1, "random_state": 42},
train_specific_batches = [1],
results_csv = 'classification_results-NIH.csv'
)
2025-08-20 17:39:48 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:48 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:48 INFO: [load_embeddings] input_folders = ['batch1'] 2025-08-20 17:39:48 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model
Loading all batches...
2025-08-20 17:39:50 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:51 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:51 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:52 INFO: [load_embeddings] embeddings shape: (115590, 192) 2025-08-20 17:39:52 INFO: [load_embeddings] labels shape: (115590,) 2025-08-20 17:39:52 INFO: [load_embeddings] example label: CLTC_WT_Untreated 2025-08-20 17:39:52 INFO: [load_embeddings] paths shape: (115590,) 2025-08-20 17:39:52 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:52 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:52 INFO: [load_embeddings] input_folders = ['batch2'] 2025-08-20 17:39:52 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:39:53 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:54 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:54 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:54 INFO: [load_embeddings] embeddings shape: (94059, 192) 2025-08-20 17:39:54 INFO: [load_embeddings] labels shape: (94059,) 2025-08-20 17:39:54 INFO: [load_embeddings] example label: DAPI_WT_Untreated 2025-08-20 17:39:54 INFO: [load_embeddings] paths shape: (94059,) 2025-08-20 17:39:54 INFO: [load_embeddings] multiplex=False 2025-08-20 17:39:54 INFO: [load_embeddings] experiment_type = NIH 2025-08-20 17:39:54 INFO: [load_embeddings] input_folders = ['batch3'] 2025-08-20 17:39:54 INFO: [load_embeddings] model_output_folder = /home/projects/hornsteinlab/Collaboration/NOVA/outputs/vit_models/pretrained_model 2025-08-20 17:39:56 INFO: [embeddings_utils._filter] markers_to_exclude = ['CD41'] 2025-08-20 17:39:56 INFO: [embeddings_utils._filter] cell_lines = ['WT'] 2025-08-20 17:39:56 INFO: [embeddings_utils._filter] conditions = ['Untreated'] 2025-08-20 17:39:57 INFO: [load_embeddings] embeddings shape: (87130, 192) 2025-08-20 17:39:57 INFO: [load_embeddings] labels shape: (87130,) 2025-08-20 17:39:57 INFO: [load_embeddings] example label: MitoTracker_WT_Untreated 2025-08-20 17:39:57 INFO: [load_embeddings] paths shape: (87130,)
Batches loaded.
Training on Batches: [1], Testing on: [2].
=== Batch [2] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (94059, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.89 0.93 0.91 2123
1 0.98 0.91 0.95 2536
2 0.95 0.92 0.94 2079
3 1.00 1.00 1.00 24823
4 0.88 0.76 0.81 2319
5 0.81 0.86 0.83 2608
6 0.98 0.92 0.95 2236
7 0.94 0.98 0.96 2265
8 0.96 0.96 0.96 2110
9 0.88 0.88 0.88 2104
10 0.90 0.95 0.93 2243
11 0.96 0.96 0.96 2236
12 1.00 0.98 0.99 2227
13 0.95 0.93 0.94 2360
14 0.92 0.89 0.90 1916
15 0.96 0.94 0.95 2074
16 0.91 0.82 0.86 1818
17 0.89 0.85 0.87 1631
18 0.93 0.96 0.95 2090
19 0.98 0.87 0.92 2019
20 0.86 0.93 0.90 1923
21 0.77 0.80 0.78 1654
22 0.85 0.90 0.87 1934
23 0.90 0.93 0.92 2086
24 0.97 0.94 0.96 2114
25 0.97 1.00 0.98 18531
accuracy 0.95 94059
macro avg 0.92 0.91 0.92 94059
weighted avg 0.95 0.95 0.95 94059
Training on Batches: [1], Testing on: [3].
=== Batch [3] ===
Train: (115590, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
Test: (87130, 192) Labels: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25]
CLTC_WT_Untreated: 2439
DAPI_WT_Untreated: 30429
GM130_WT_Untreated: 2371
PURA_WT_Untreated: 2712
PSD95_WT_Untreated: 2101
SQSTM1_WT_Untreated: 2651
SNCA_WT_Untreated: 2454
G3BP1_WT_Untreated: 2842
PEX14_WT_Untreated: 2505
KIF5A_WT_Untreated: 2622
NEMO_WT_Untreated: 2935
MitoTracker_WT_Untreated: 2728
TIA1_WT_Untreated: 2712
NCL_WT_Untreated: 2709
TUJ1_WT_Untreated: 22601
P54_WT_Untreated: 2623
ANAX11_WT_Untreated: 2614
DCP1A_WT_Untreated: 2364
TOMM20_WT_Untreated: 2363
FMRP_WT_Untreated: 2913
FUS_WT_Untreated: 2728
PML_WT_Untreated: 2297
Calreticulin_WT_Untreated: 3056
TDP43_WT_Untreated: 2535
Phalloidin_WT_Untreated: 2219
LAMP1_WT_Untreated: 3067
precision recall f1-score support
0 0.85 0.87 0.86 1850
1 0.93 0.37 0.53 2044
2 0.95 0.95 0.95 2332
3 0.99 1.00 0.99 22599
4 0.73 0.50 0.59 1901
5 0.37 0.31 0.34 1492
6 0.95 0.78 0.85 2095
7 0.81 0.95 0.88 2384
8 0.94 0.95 0.94 2145
9 0.97 0.84 0.90 2358
10 0.88 0.96 0.92 2340
11 0.94 0.92 0.93 2095
12 0.98 0.98 0.98 2085
13 0.81 0.99 0.89 2117
14 0.92 0.87 0.90 1751
15 0.96 0.84 0.90 1855
16 0.64 0.48 0.55 1623
17 0.70 0.69 0.69 1903
18 0.86 0.98 0.92 2085
19 0.98 0.88 0.93 2152
20 0.58 0.92 0.71 1857
21 0.88 0.59 0.70 1484
22 0.87 0.92 0.90 1836
23 0.84 0.95 0.89 2078
24 0.95 0.96 0.96 2200
25 0.93 0.99 0.96 16469
accuracy 0.90 87130
macro avg 0.85 0.82 0.83 87130
weighted avg 0.90 0.90 0.90 87130
=== Overall Accuracy ===
0.9257885153576774 [0.9502227325402142, 0.9013542981751406]
=== Evaluation Metrics ===
Label Accuracy Sensitivity Specificity PPV NPV
ANAX11_WT_Untreated 0.994922 0.901334 0.997021 0.871502 0.997786
CLTC_WT_Untreated 0.991147 0.669432 0.999490 0.971483 0.991496
Calreticulin_WT_Untreated 0.997213 0.934482 0.998778 0.950207 0.998366
DAPI_WT_Untreated 0.997721 0.999958 0.996927 0.991407 0.999985
DCP1A_WT_Untreated 0.988437 0.642180 0.996694 0.822458 0.991512
FMRP_WT_Untreated 0.985054 0.658780 0.992608 0.673566 0.992104
FUS_WT_Untreated 0.995778 0.853383 0.999265 0.966022 0.996420
G3BP1_WT_Untreated 0.995469 0.963003 0.996324 0.873391 0.999023
GM130_WT_Untreated 0.997676 0.951586 0.998785 0.949578 0.998836
KIF5A_WT_Untreated 0.994834 0.858359 0.998280 0.926463 0.996430
LAMP1_WT_Untreated 0.995971 0.958979 0.996931 0.890217 0.998933
MitoTracker_WT_Untreated 0.997489 0.941122 0.998869 0.953227 0.998559
NCL_WT_Untreated 0.999283 0.979592 0.999763 0.990155 0.999503
NEMO_WT_Untreated 0.995496 0.955551 0.996508 0.873953 0.998871
P54_WT_Untreated 0.996070 0.883283 0.998400 0.919387 0.997591
PEX14_WT_Untreated 0.996904 0.895139 0.999159 0.959356 0.997679
PML_WT_Untreated 0.990253 0.658529 0.996675 0.793140 0.993411
PSD95_WT_Untreated 0.991291 0.762875 0.995835 0.784633 0.995286
PURA_WT_Untreated 0.996722 0.969102 0.997373 0.896919 0.999270
Phalloidin_WT_Untreated 0.996771 0.876528 0.999605 0.981213 0.997098
SNCA_WT_Untreated 0.989911 0.924868 0.991297 0.693651 0.998388
SQSTM1_WT_Untreated 0.991953 0.697897 0.997136 0.811111 0.994689
TDP43_WT_Untreated 0.995049 0.907692 0.996906 0.861748 0.998036
TIA1_WT_Untreated 0.995408 0.941643 0.996673 0.869401 0.998625
TOMM20_WT_Untreated 0.997892 0.951089 0.999033 0.959991 0.998807
TUJ1_WT_Untreated 0.988730 0.992429 0.987845 0.951331 0.998168
Macro Average 0.994363 0.874185 0.997007 0.891750 0.997110
{'Accuracy': 0.9943633018985275,
'Sensitivity': 0.8741852119354661,
'Specificity': 0.9970069021537664,
'PPV': 0.8917503692831067,
'NPV': 0.9971104454643182}